fingerprint: don't clear Consul/Vault attributes on failure (#14673)

Clients periodically fingerprint Vault and Consul to ensure the server has
updated attributes in the client's fingerprint. If the client can't reach
Vault/Consul, the fingerprinter clears the attributes and requires a node
update. Although this seems like correct behavior so that we can detect
intentional removal of Vault/Consul access, it has two serious failure modes:

(1) If a local Consul agent is restarted to pick up configuration changes and the
client happens to fingerprint at that moment, the client will update its
fingerprint and result in evaluations for all its jobs and all the system jobs
in the cluster.

(2) If a client loses Vault connectivity, the same thing happens. But the
consequences are much worse in the Vault case because Vault is not run as a
local agent, so Vault connectivity failures are highly correlated across the
entire cluster. A 15 second Vault outage will cause a new `node-update`
evalution for every system job on the cluster times the number of nodes, plus
one `node-update` evaluation for every non-system job on each node. On large
clusters of 1000s of nodes, we've seen this create a large backlog of evaluations.

This changeset updates the fingerprinting behavior to keep the last fingerprint
if Consul or Vault queries fail. This prevents a storm of evaluations at the
cost of requiring a client restart if Consul or Vault is intentionally removed
from the client.
This commit is contained in:
Tim Gross 2022-09-23 14:45:12 -04:00 committed by GitHub
parent 0211240a03
commit 17aee4d69c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 37 additions and 46 deletions

3
.changelog/14673.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:improvement
fingerprint: Consul and Vault attributes are no longer cleared on fingerprinting failure
```

View File

@ -75,14 +75,6 @@ func (f *ConsulFingerprint) Periodic() (bool, time.Duration) {
return true, 15 * time.Second
}
// clearConsulAttributes removes consul attributes and links from the passed Node.
func (f *ConsulFingerprint) clearConsulAttributes(r *FingerprintResponse) {
for attr := range f.extractors {
r.RemoveAttribute(attr)
}
r.RemoveLink("consul")
}
func (f *ConsulFingerprint) initialize(req *FingerprintRequest) error {
// Only create the Consul client once to avoid creating many connections
if f.client == nil {
@ -118,8 +110,6 @@ func (f *ConsulFingerprint) query(resp *FingerprintResponse) agentconsul.Self {
// If we can't hit this URL consul is probably not running on this machine.
info, err := f.client.Agent().Self()
if err != nil {
f.clearConsulAttributes(resp)
// indicate consul no longer available
if f.lastState == consulAvailable {
f.logger.Info("consul agent is unavailable")

View File

@ -412,18 +412,7 @@ func TestConsulFingerprint_Fingerprint_oss(t *testing.T) {
// execute second query with error
err2 := cf.Fingerprint(&FingerprintRequest{Config: cfg, Node: node}, &resp2)
require.NoError(t, err2) // does not return error
require.Equal(t, map[string]string{ // attributes set empty
"consul.datacenter": "",
"consul.revision": "",
"consul.segment": "",
"consul.server": "",
"consul.sku": "",
"consul.version": "",
"unique.consul.name": "",
"consul.connect": "",
"consul.grpc": "",
"consul.ft.namespaces": "",
}, resp2.Attributes)
require.Nil(t, resp2.Attributes) // attributes unset so they don't change
require.True(t, resp.Detected) // never downgrade
// consul no longer available
@ -502,18 +491,7 @@ func TestConsulFingerprint_Fingerprint_ent(t *testing.T) {
// execute second query with error
err2 := cf.Fingerprint(&FingerprintRequest{Config: cfg, Node: node}, &resp2)
require.NoError(t, err2) // does not return error
require.Equal(t, map[string]string{ // attributes set empty
"consul.datacenter": "",
"consul.revision": "",
"consul.segment": "",
"consul.server": "",
"consul.sku": "",
"consul.version": "",
"consul.ft.namespaces": "",
"consul.connect": "",
"consul.grpc": "",
"unique.consul.name": "",
}, resp2.Attributes)
require.Nil(t, resp2.Attributes) // attributes unset so they don't change
require.True(t, resp.Detected) // never downgrade
// consul no longer available

View File

@ -51,7 +51,7 @@ func (f *VaultFingerprint) Fingerprint(req *FingerprintRequest, resp *Fingerprin
// Connect to vault and parse its information
status, err := f.client.Sys().SealStatus()
if err != nil {
f.clearVaultAttributes(resp)
// Print a message indicating that Vault is not available anymore
if f.lastState == vaultAvailable {
f.logger.Info("Vault is unavailable")
@ -80,10 +80,3 @@ func (f *VaultFingerprint) Fingerprint(req *FingerprintRequest, resp *Fingerprin
func (f *VaultFingerprint) Periodic() (bool, time.Duration) {
return true, 15 * time.Second
}
func (f *VaultFingerprint) clearVaultAttributes(r *FingerprintResponse) {
r.RemoveAttribute("vault.accessible")
r.RemoveAttribute("vault.version")
r.RemoveAttribute("vault.cluster_id")
r.RemoveAttribute("vault.cluster_name")
}

View File

@ -39,4 +39,21 @@ func TestVaultFingerprint(t *testing.T) {
assertNodeAttributeContains(t, response.Attributes, "vault.version")
assertNodeAttributeContains(t, response.Attributes, "vault.cluster_id")
assertNodeAttributeContains(t, response.Attributes, "vault.cluster_name")
tv.Stop()
err = fp.Fingerprint(request, &response)
if err != nil {
t.Fatalf("Failed to fingerprint: %s", err)
}
if !response.Detected {
t.Fatalf("should still show as detected")
}
assertNodeAttributeContains(t, response.Attributes, "vault.accessible")
assertNodeAttributeContains(t, response.Attributes, "vault.version")
assertNodeAttributeContains(t, response.Attributes, "vault.cluster_id")
assertNodeAttributeContains(t, response.Attributes, "vault.cluster_name")
}

View File

@ -1,6 +1,7 @@
package testutil
import (
"errors"
"fmt"
"math/rand"
"os"
@ -215,6 +216,9 @@ func (tv *TestVault) Stop() {
}
if err := tv.cmd.Process.Kill(); err != nil {
if errors.Is(err, os.ErrProcessDone) {
return
}
tv.t.Errorf("err: %s", err)
}
if tv.waitCh != nil {

View File

@ -58,6 +58,12 @@ can use `nomad eval list -json` to get that list in JSON format. The `nomad eval
status <eval ID>` command will format a specific evaluation in JSON format if
the `-json` flag is provided.
#### Removing Vault/Consul from Clients
Nomad clients no longer have their Consul and Vault fingerprints cleared when
connectivity is lost with Consul and Vault. To intentionally remove Consul and
Vault from a client node, you will need to restart the client.
## Nomad 1.3.3
Environments that don't support the use of [`uid`][template_uid] and