Add telemetry to Vault agent (#13675)

This patch adds a new /agent/v1/metrics that will return metrics on the
running Vault agent. Configuration is done using the same telemetry
stanza as the Vault server. For now default runtime metrics are
returned with a few additional ones specific to the agent:
  - `vault.agent.auth.failure` and `vault.agent.auth.success` to monitor
  the correct behavior of the auto auth mechanism
  - `vault.agent.proxy.success`, `vault.agent.proxy.client_error` and
  `vault.agent.proxy.error` to check the connection with the Vault server
  - `vault.agent.cache.hit` and `vault.agent.cache.miss` to monitor the
  cache

Closes https://github.com/hashicorp/vault/issues/8649

Co-authored-by: Theron Voran <tvoran@users.noreply.github.com>
This commit is contained in:
Rémi Lapeyre 2022-02-18 02:10:26 +01:00 committed by GitHub
parent 880a8143e0
commit 98b18ee08e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 211 additions and 15 deletions

View File

@ -225,7 +225,7 @@ func (r *LifetimeWatcher) Start() {
r.doneCh <- r.doRenew()
}
// Renew is for comnpatibility with the legacy api.Renewer. Calling Renew
// Renew is for compatibility with the legacy api.Renewer. Calling Renew
// simply chains to Start.
func (r *LifetimeWatcher) Renew() {
r.Start()

4
changelog/13675.txt Normal file
View File

@ -0,0 +1,4 @@
```release-note:feature
agent: The Vault agent now returns telemetry information at the `/agent/v1/metrics`
path.
```

View File

@ -40,10 +40,12 @@ import (
"github.com/hashicorp/vault/command/agent/sink/inmem"
"github.com/hashicorp/vault/command/agent/template"
"github.com/hashicorp/vault/command/agent/winsvc"
"github.com/hashicorp/vault/helper/metricsutil"
"github.com/hashicorp/vault/internalshared/configutil"
"github.com/hashicorp/vault/internalshared/listenerutil"
"github.com/hashicorp/vault/sdk/helper/consts"
"github.com/hashicorp/vault/sdk/helper/logging"
"github.com/hashicorp/vault/sdk/helper/useragent"
"github.com/hashicorp/vault/sdk/logical"
"github.com/hashicorp/vault/sdk/version"
"github.com/kr/pretty"
@ -68,6 +70,9 @@ type AgentCommand struct {
logGate *gatedwriter.Writer
logger log.Logger
// Telemetry object
metricsHelper *metricsutil.MetricsHelper
cleanupGuard sync.Once
startedCh chan (struct{}) // for tests
@ -340,6 +345,21 @@ func (c *AgentCommand) Run(args []string) int {
ctx, cancelFunc := context.WithCancel(context.Background())
defer cancelFunc()
// telemetry configuration
inmemMetrics, _, prometheusEnabled, err := configutil.SetupTelemetry(&configutil.SetupTelemetryOpts{
Config: config.Telemetry,
Ui: c.UI,
ServiceName: "vault",
DisplayName: "Vault",
UserAgent: useragent.String(),
ClusterName: config.ClusterName,
})
if err != nil {
c.UI.Error(fmt.Sprintf("Error initializing telemetry: %s", err))
return 1
}
c.metricsHelper = metricsutil.NewMetricsHelper(inmemMetrics, prometheusEnabled)
var method auth.AuthMethod
var sinks []*sink.SinkConfig
var templateNamespace string
@ -696,6 +716,7 @@ func (c *AgentCommand) Run(args []string) int {
// Create a muxer and add paths relevant for the lease cache layer
mux := http.NewServeMux()
mux.Handle(consts.AgentPathCacheClear, leaseCache.HandleCacheClear(ctx))
mux.Handle(consts.AgentPathMetrics, c.handleMetrics())
mux.Handle("/", muxHandler)
scheme := "https://"
@ -990,3 +1011,39 @@ func getServiceAccountJWT(tokenFile string) (string, error) {
}
return strings.TrimSpace(string(token)), nil
}
func (c *AgentCommand) handleMetrics() http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodGet {
logical.RespondError(w, http.StatusMethodNotAllowed, nil)
return
}
if err := r.ParseForm(); err != nil {
logical.RespondError(w, http.StatusBadRequest, err)
return
}
format := r.Form.Get("format")
if format == "" {
format = metricsutil.FormatFromRequest(&logical.Request{
Headers: r.Header,
})
}
resp := c.metricsHelper.ResponseForFormat(format)
status := resp.Data[logical.HTTPStatusCode].(int)
w.Header().Set("Content-Type", resp.Data[logical.HTTPContentType].(string))
switch v := resp.Data[logical.HTTPRawBody].(type) {
case string:
w.WriteHeader((status))
w.Write([]byte(v))
case []byte:
w.WriteHeader(status)
w.Write(v)
default:
logical.RespondError(w, http.StatusInternalServerError, fmt.Errorf("wrong response returned"))
}
})
}

View File

@ -8,6 +8,7 @@ import (
"net/http"
"time"
"github.com/armon/go-metrics"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/vault/api"
"github.com/hashicorp/vault/sdk/helper/jsonutil"
@ -156,6 +157,7 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
if err != nil {
ah.logger.Error("error creating client for authentication call", "error", err, "backoff", backoff)
backoffOrQuit(ctx, backoff)
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
continue
}
default:
@ -174,6 +176,7 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
if err != nil {
ah.logger.Error("could not look up token", "err", err, "backoff", backoff)
backoffOrQuit(ctx, backoff)
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
continue
}
@ -190,6 +193,7 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
if err != nil {
ah.logger.Error("error getting path or data from method", "error", err, "backoff", backoff)
backoffOrQuit(ctx, backoff)
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
continue
}
}
@ -199,6 +203,7 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
if err != nil {
ah.logger.Error("error creating client for wrapped call", "error", err, "backoff", backoff)
backoffOrQuit(ctx, backoff)
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
continue
}
wrapClient.SetWrappingLookupFunc(func(string, string) string {
@ -220,6 +225,7 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
if err != nil {
ah.logger.Error("error authenticating", "error", err, "backoff", backoff)
backoffOrQuit(ctx, backoff)
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
continue
}
}
@ -229,17 +235,20 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
if secret.WrapInfo == nil {
ah.logger.Error("authentication returned nil wrap info", "backoff", backoff)
backoffOrQuit(ctx, backoff)
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
continue
}
if secret.WrapInfo.Token == "" {
ah.logger.Error("authentication returned empty wrapped client token", "backoff", backoff)
backoffOrQuit(ctx, backoff)
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
continue
}
wrappedResp, err := jsonutil.EncodeJSON(secret.WrapInfo)
if err != nil {
ah.logger.Error("failed to encode wrapinfo", "error", err, "backoff", backoff)
backoffOrQuit(ctx, backoff)
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
continue
}
ah.logger.Info("authentication successful, sending wrapped token to sinks and pausing")
@ -265,11 +274,13 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
if secret == nil || secret.Auth == nil {
ah.logger.Error("authentication returned nil auth info", "backoff", backoff)
backoffOrQuit(ctx, backoff)
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
continue
}
if secret.Auth.ClientToken == "" {
ah.logger.Error("authentication returned empty client token", "backoff", backoff)
backoffOrQuit(ctx, backoff)
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
continue
}
ah.logger.Info("authentication successful, sending token to sinks")
@ -292,11 +303,13 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
if err != nil {
ah.logger.Error("error creating lifetime watcher, backing off and retrying", "error", err, "backoff", backoff)
backoffOrQuit(ctx, backoff)
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
continue
}
// Start the renewal process
ah.logger.Info("starting renewal process")
metrics.IncrCounter([]string{"agent", "auth", "success"}, 1)
go watcher.Renew()
LifetimeWatcherLoop:
@ -310,11 +323,13 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
case err := <-watcher.DoneCh():
ah.logger.Info("lifetime watcher done channel triggered")
if err != nil {
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
ah.logger.Error("error renewing token", "error", err)
}
break LifetimeWatcherLoop
case <-watcher.RenewCh():
metrics.IncrCounter([]string{"agent", "auth", "success"}, 1)
ah.logger.Info("renewed auth token")
case <-credCh:

View File

@ -12,6 +12,7 @@ import (
"net/http"
"time"
"github.com/armon/go-metrics"
hclog "github.com/hashicorp/go-hclog"
"github.com/hashicorp/vault/api"
"github.com/hashicorp/vault/command/agent/sink"
@ -58,7 +59,9 @@ func Handler(ctx context.Context, logger hclog.Logger, proxier Proxier, inmemSin
copyHeader(w.Header(), resp.Response.Header)
w.WriteHeader(resp.Response.StatusCode)
io.Copy(w, resp.Response.Body)
metrics.IncrCounter([]string{"agent", "proxy", "client_error"}, 1)
} else {
metrics.IncrCounter([]string{"agent", "proxy", "error"}, 1)
logical.RespondError(w, http.StatusInternalServerError, fmt.Errorf("failed to get the response: %w", err))
}
return
@ -72,6 +75,15 @@ func Handler(ctx context.Context, logger hclog.Logger, proxier Proxier, inmemSin
defer resp.Response.Body.Close()
metrics.IncrCounter([]string{"agent", "proxy", "success"}, 1)
if resp.CacheMeta != nil {
if resp.CacheMeta.Hit {
metrics.IncrCounter([]string{"agent", "cache", "hit"}, 1)
} else {
metrics.IncrCounter([]string{"agent", "cache", "miss"}, 1)
}
}
// Set headers
setHeaders(w, resp)

View File

@ -26,6 +26,7 @@ import (
"github.com/hashicorp/vault/sdk/logical"
"github.com/hashicorp/vault/vault"
"github.com/mitchellh/cli"
"github.com/stretchr/testify/require"
)
func testAgentCommand(tb testing.TB, logger hclog.Logger) (*cli.MockUi, *AgentCommand) {
@ -614,7 +615,7 @@ func TestAgent_RequireAutoAuthWithForce(t *testing.T) {
// Create a config file
config := `
cache {
use_auto_auth_token = "force"
use_auto_auth_token = "force"
}
listener "tcp" {
@ -1999,3 +2000,85 @@ vault {
})
}
}
func TestAgent_Metrics(t *testing.T) {
//----------------------------------------------------
// Start the server and agent
//----------------------------------------------------
// Start a vault server
logger := logging.NewVaultLogger(hclog.Trace)
cluster := vault.NewTestCluster(t,
&vault.CoreConfig{
Logger: logger,
},
&vault.TestClusterOptions{
HandlerFunc: vaulthttp.Handler,
})
cluster.Start()
defer cluster.Cleanup()
vault.TestWaitActive(t, cluster.Cores[0].Core)
serverClient := cluster.Cores[0].Client
// Create a config file
config := `
cache {}
listener "tcp" {
address = "127.0.0.1:8101"
tls_disable = true
}
`
configPath := makeTempFile(t, "config.hcl", config)
defer os.Remove(configPath)
// Start the agent
ui, cmd := testAgentCommand(t, logger)
cmd.client = serverClient
cmd.startedCh = make(chan struct{})
wg := &sync.WaitGroup{}
wg.Add(1)
go func() {
code := cmd.Run([]string{"-config", configPath})
if code != 0 {
t.Errorf("non-zero return code when running agent: %d", code)
t.Logf("STDOUT from agent:\n%s", ui.OutputWriter.String())
t.Logf("STDERR from agent:\n%s", ui.ErrorWriter.String())
}
wg.Done()
}()
select {
case <-cmd.startedCh:
case <-time.After(5 * time.Second):
t.Errorf("timeout")
}
// defer agent shutdown
defer func() {
cmd.ShutdownCh <- struct{}{}
wg.Wait()
}()
conf := api.DefaultConfig()
conf.Address = "http://127.0.0.1:8101"
agentClient, err := api.NewClient(conf)
if err != nil {
t.Fatalf("err: %s", err)
}
req := agentClient.NewRequest("GET", "/agent/v1/metrics")
body := request(t, agentClient, req, 200)
keys := []string{}
for k := range body {
keys = append(keys, k)
}
require.ElementsMatch(t, keys, []string{
"Counters",
"Samples",
"Timestamp",
"Gauges",
"Points",
})
}

View File

@ -3,3 +3,7 @@ package consts
// AgentPathCacheClear is the path that the agent will use as its cache-clear
// endpoint.
const AgentPathCacheClear = "/agent/v1/cache-clear"
// AgentPathMetrics is the path the the agent will use to expose its internal
// metrics.
const AgentPathMetrics = "/agent/v1/metrics"

View File

@ -39,12 +39,12 @@ import (
// Fetches a key-value secret (kv-v2) after authenticating via AppRole
func getSecretWithAppRole() (string, error) {
config := vault.DefaultConfig()
config := vault.DefaultConfig()
client := vault.NewClient(config)
wrappingToken := ioutil.ReadFile("path/to/wrapping-token")
wrappingToken := ioutil.ReadFile("path/to/wrapping-token")
unwrappedToken := client.Logical().Unwrap(strings.TrimSuffix(string(wrappingToken), "\n"))
secretID := unwrappedToken.Data["secret_id"]
roleID := os.Getenv("APPROLE_ROLE_ID")
@ -61,7 +61,7 @@ func getSecretWithAppRole() (string, error) {
}
data := secret.Data["data"].(map[string]interface{})
...snip...
}
```
@ -74,7 +74,7 @@ Vault. However, in other situations where you have a large number of
applications, as in large enterprises, you may not have the resources or expertise
to update and maintain the Vault integration code for every application. When
third party applications are being deployed by the application, it is prohibited
to add the Vault integration code.
to add the Vault integration code.
Vault Agent aims to remove this initial hurdle to adopt Vault by providing a
more scalable and simpler way for applications to integrate with Vault.
@ -84,14 +84,14 @@ more scalable and simpler way for applications to integrate with Vault.
Vault Agent is a client daemon that provides the following features:
- [Auto-Auth][autoauth] - Automatically authenticate to Vault and manage the
- [Auto-Auth][autoauth] - Automatically authenticate to Vault and manage the
token renewal process for locally-retrieved dynamic secrets.
- [Caching][caching] - Allows client-side caching of responses containing newly
- [Caching][caching] - Allows client-side caching of responses containing newly
created tokens and responses containing leased secrets generated off of these
newly created tokens. The agent also manages the renewals of the cached tokens and leases.
- [Windows Service][winsvc] - Allows running the Vault Agent as a Windows
- [Windows Service][winsvc] - Allows running the Vault Agent as a Windows
service.
- [Templating][template] - Allows rendering of user-supplied templates by Vault
- [Templating][template] - Allows rendering of user-supplied templates by Vault
Agent, using the token generated by the Auto-Auth step.
@ -132,6 +132,10 @@ These are the currently-available general configuration option:
- `template_config` <code>([template_config][template-config]: <optional\>)</code> - Specifies templating engine behavior.
- `telemetry` <code>([telemetry][telemetry]: <optional\>)</code> Specifies the telemetry
reporting system. See the [telemetry Stanza](/docs/agent#telemetry-stanza) section below
for a list of metrics specific to Agent.
### vault Stanza
There can at most be one top level `vault` block and it has the following
@ -218,16 +222,32 @@ supports an additional optional entry:
Request Forgery attacks. Requests on the listener that do not have the proper
`X-Vault-Request` header will fail, with a HTTP response status code of `412: Precondition Failed`.
### telemetry Stanza
## Start Vault Agent
Vault Agent supports the [telemetry][telemetry] stanza and collects various
runtime metrics about its performance, the auto-auth and the cache status:
To run Vault Agent:
| Metric | Description | Type |
| -------------------------------- | ---------------------------------------------------- | ------- |
| `vault.agent.auth.failure` | Number of authentication failures | counter |
| `vault.agent.auth.success` | Number of authentication successes | counter |
| `vault.agent.proxy.success` | Number of requests successfully proxied | counter |
| `vault.agent.proxy.client_error` | Number of requests for which Vault returned an error | counter |
| `vault.agent.proxy.error` | Number of requests the agent failed to proxy | counter |
| `vault.agent.cache.hit` | Number of cache hits | counter |
| `vault.agent.cache.miss` | Number of cache misses | counter |
## Start Vault Agent
To run Vault Agent:
1. [Download](/downloads) the Vault binary where the client application runs
(virtual machine, Kubernetes pod, etc.)
(virtual machine, Kubernetes pod, etc.)
1. Create a Vault Agent configuration file. (See the [Example
Configuration](#example-configuration) section for an example configuration.)
Configuration](#example-configuration) section for an example configuration.)
1. Start a Vault Agent with the configuration file.
@ -317,3 +337,4 @@ template {
[listener]: /docs/agent#listener-stanza
[listener_main]: /docs/configuration/listener/tcp
[winsvc]: /docs/agent/winsvc
[telemetry]: /docs/configuration/telemetry