Add DR Metric scraping capability to debug command (#15316)

* Add server information as well as ability to collect metrics from DR secondary

* Update debug docs

Adding additional information around ability to gather metrics from DR secondary

* Fix broken link in updated doc

* Create 15316.txt

Create changelog entry

* Fix Formatting

* Update website/content/docs/commands/debug.mdx

Co-authored-by: Jason O'Donnell <2160810+jasonodonnell@users.noreply.github.com>

* Update changelog/15316.txt

Co-authored-by: Jason O'Donnell <2160810+jasonodonnell@users.noreply.github.com>

* Trigger Build

Co-authored-by: Jason O'Donnell <2160810+jasonodonnell@users.noreply.github.com>
This commit is contained in:
davidadeleon 2022-05-06 16:04:08 -04:00 committed by GitHub
parent 4be45db85b
commit 9e869c52fa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 25 additions and 16 deletions

3
changelog/15316.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:improvement
cli/debug: added support for retrieving metrics from DR clusters if `unauthenticated_metrics_access` is enabled
```

View File

@ -58,6 +58,7 @@ type debugIndex struct {
Version int `json:"version"` Version int `json:"version"`
VaultAddress string `json:"vault_address"` VaultAddress string `json:"vault_address"`
ClientVersion string `json:"client_version"` ClientVersion string `json:"client_version"`
ServerVersion string `json:"server_version"`
Timestamp time.Time `json:"timestamp"` Timestamp time.Time `json:"timestamp"`
DurationSeconds int `json:"duration_seconds"` DurationSeconds int `json:"duration_seconds"`
IntervalSeconds int `json:"interval_seconds"` IntervalSeconds int `json:"interval_seconds"`
@ -245,6 +246,7 @@ func (c *DebugCommand) Run(args []string) int {
c.UI.Output("==> Starting debug capture...") c.UI.Output("==> Starting debug capture...")
c.UI.Info(fmt.Sprintf(" Vault Address: %s", c.debugIndex.VaultAddress)) c.UI.Info(fmt.Sprintf(" Vault Address: %s", c.debugIndex.VaultAddress))
c.UI.Info(fmt.Sprintf(" Client Version: %s", c.debugIndex.ClientVersion)) c.UI.Info(fmt.Sprintf(" Client Version: %s", c.debugIndex.ClientVersion))
c.UI.Info(fmt.Sprintf(" Server Version: %s", c.debugIndex.ServerVersion))
c.UI.Info(fmt.Sprintf(" Duration: %s", c.flagDuration)) c.UI.Info(fmt.Sprintf(" Duration: %s", c.flagDuration))
c.UI.Info(fmt.Sprintf(" Interval: %s", c.flagInterval)) c.UI.Info(fmt.Sprintf(" Interval: %s", c.flagInterval))
c.UI.Info(fmt.Sprintf(" Metrics Interval: %s", c.flagMetricsInterval)) c.UI.Info(fmt.Sprintf(" Metrics Interval: %s", c.flagMetricsInterval))
@ -412,9 +414,20 @@ func (c *DebugCommand) preflight(rawArgs []string) (string, error) {
if err != nil { if err != nil {
return "", fmt.Errorf("unable to create client to connect to Vault: %s", err) return "", fmt.Errorf("unable to create client to connect to Vault: %s", err)
} }
if _, err := client.Sys().Health(); err != nil { serverHealth, err := client.Sys().Health()
if err != nil {
return "", fmt.Errorf("unable to connect to the server: %s", err) return "", fmt.Errorf("unable to connect to the server: %s", err)
} }
// Check if server is DR Secondary and we need to further
// ignore any targets due to endpoint restrictions
if serverHealth.ReplicationDRMode == "secondary" {
invalidDRTargets := strutil.Difference(c.flagTargets, c.validDRSecondaryTargets(), true)
if len(invalidDRTargets) != 0 {
c.UI.Info(fmt.Sprintf("Ignoring invalid targets for DR Secondary: %s", strings.Join(invalidDRTargets, ", ")))
c.flagTargets = strutil.Difference(c.flagTargets, invalidDRTargets, true)
}
}
c.cachedClient = client c.cachedClient = client
captureTime := time.Now().UTC() captureTime := time.Now().UTC()
@ -469,6 +482,7 @@ func (c *DebugCommand) preflight(rawArgs []string) (string, error) {
c.debugIndex = &debugIndex{ c.debugIndex = &debugIndex{
VaultAddress: client.Address(), VaultAddress: client.Address(),
ClientVersion: version.GetVersion().VersionNumber(), ClientVersion: version.GetVersion().VersionNumber(),
ServerVersion: serverHealth.Version,
Compress: c.flagCompress, Compress: c.flagCompress,
DurationSeconds: int(c.flagDuration.Seconds()), DurationSeconds: int(c.flagDuration.Seconds()),
IntervalSeconds: int(c.flagInterval.Seconds()), IntervalSeconds: int(c.flagInterval.Seconds()),
@ -487,6 +501,10 @@ func (c *DebugCommand) defaultTargets() []string {
return []string{"config", "host", "requests", "metrics", "pprof", "replication-status", "server-status", "log"} return []string{"config", "host", "requests", "metrics", "pprof", "replication-status", "server-status", "log"}
} }
// validDRSecondaryTargets returns the capture targets that are kept when the
// server reports itself as a DR secondary; any other requested target is
// dropped from the run with an informational message.
func (c *DebugCommand) validDRSecondaryTargets() []string {
	drTargets := []string{"metrics", "replication-status", "server-status"}
	return drTargets
}
func (c *DebugCommand) captureStaticTargets() error { func (c *DebugCommand) captureStaticTargets() error {
// Capture configuration state // Capture configuration state
if strutil.StrListContains(c.flagTargets, "config") { if strutil.StrListContains(c.flagTargets, "config") {
@ -686,21 +704,6 @@ func (c *DebugCommand) collectMetrics(ctx context.Context) {
c.logger.Info("capturing metrics", "count", idxCount) c.logger.Info("capturing metrics", "count", idxCount)
idxCount++ idxCount++
healthStatus, err := c.cachedClient.Sys().Health()
if err != nil {
c.captureError("metrics", err)
continue
}
// Check replication status. We skip processing metrics if we're on
// a DR node, though non-perf standbys will fail if they aren't using
// unauthenticated_metrics_access.
switch {
case healthStatus.ReplicationDRMode == "secondary":
c.logger.Info("skipping metrics capture on DR secondary node")
continue
}
// Perform metrics request // Perform metrics request
r := c.cachedClient.NewRequest("GET", "/v1/sys/metrics") r := c.cachedClient.NewRequest("GET", "/v1/sys/metrics")
resp, err := c.cachedClient.RawRequestWithContext(ctx, r) resp, err := c.cachedClient.RawRequestWithContext(ctx, r)

View File

@ -57,6 +57,9 @@ pertains to the local node and the request should not be forwarded.
Additionally, host information is not available on the OpenBSD platform due to Additionally, host information is not available on the OpenBSD platform due to
library limitations in fetching the data without enabling `cgo`. library limitations in fetching the data without enabling `cgo`.
[Enterprise] Telemetry can be gathered from a DR Secondary active node via the
`metrics` target if [unauthenticated_metrics_access](/docs/configuration/listener/tcp#unauthenticated_metrics_access) is enabled.
## Output Layout ## Output Layout
The output of the bundled information, once decompressed, is contained within a The output of the bundled information, once decompressed, is contained within a