fingerprint: don't clear Consul/Vault attributes on failure (#14673)
Clients periodically fingerprint Vault and Consul to ensure the server has updated attributes in the client's fingerprint. If the client can't reach Vault/Consul, the fingerprinter clears the attributes and requires a node update. Although this seems like correct behavior so that we can detect intentional removal of Vault/Consul access, it has two serious failure modes: (1) If a local Consul agent is restarted to pick up configuration changes and the client happens to fingerprint at that moment, the client will update its fingerprint and result in evaluations for all its jobs and all the system jobs in the cluster. (2) If a client loses Vault connectivity, the same thing happens. But the consequences are much worse in the Vault case because Vault is not run as a local agent, so Vault connectivity failures are highly correlated across the entire cluster. A 15 second Vault outage will cause a new `node-update` evalution for every system job on the cluster times the number of nodes, plus one `node-update` evaluation for every non-system job on each node. On large clusters of 1000s of nodes, we've seen this create a large backlog of evaluations. This changeset updates the fingerprinting behavior to keep the last fingerprint if Consul or Vault queries fail. This prevents a storm of evaluations at the cost of requiring a client restart if Consul or Vault is intentionally removed from the client.
This commit is contained in:
parent
0211240a03
commit
17aee4d69c
|
@ -0,0 +1,3 @@
|
|||
```release-note:improvement
|
||||
fingerprint: Consul and Vault attributes are no longer cleared on fingerprinting failure
|
||||
```
|
|
@ -75,14 +75,6 @@ func (f *ConsulFingerprint) Periodic() (bool, time.Duration) {
|
|||
return true, 15 * time.Second
|
||||
}
|
||||
|
||||
// clearConsulAttributes removes consul attributes and links from the passed Node.
|
||||
func (f *ConsulFingerprint) clearConsulAttributes(r *FingerprintResponse) {
|
||||
for attr := range f.extractors {
|
||||
r.RemoveAttribute(attr)
|
||||
}
|
||||
r.RemoveLink("consul")
|
||||
}
|
||||
|
||||
func (f *ConsulFingerprint) initialize(req *FingerprintRequest) error {
|
||||
// Only create the Consul client once to avoid creating many connections
|
||||
if f.client == nil {
|
||||
|
@ -118,8 +110,6 @@ func (f *ConsulFingerprint) query(resp *FingerprintResponse) agentconsul.Self {
|
|||
// If we can't hit this URL consul is probably not running on this machine.
|
||||
info, err := f.client.Agent().Self()
|
||||
if err != nil {
|
||||
f.clearConsulAttributes(resp)
|
||||
|
||||
// indicate consul no longer available
|
||||
if f.lastState == consulAvailable {
|
||||
f.logger.Info("consul agent is unavailable")
|
||||
|
|
|
@ -411,20 +411,9 @@ func TestConsulFingerprint_Fingerprint_oss(t *testing.T) {
|
|||
|
||||
// execute second query with error
|
||||
err2 := cf.Fingerprint(&FingerprintRequest{Config: cfg, Node: node}, &resp2)
|
||||
require.NoError(t, err2) // does not return error
|
||||
require.Equal(t, map[string]string{ // attributes set empty
|
||||
"consul.datacenter": "",
|
||||
"consul.revision": "",
|
||||
"consul.segment": "",
|
||||
"consul.server": "",
|
||||
"consul.sku": "",
|
||||
"consul.version": "",
|
||||
"unique.consul.name": "",
|
||||
"consul.connect": "",
|
||||
"consul.grpc": "",
|
||||
"consul.ft.namespaces": "",
|
||||
}, resp2.Attributes)
|
||||
require.True(t, resp.Detected) // never downgrade
|
||||
require.NoError(t, err2) // does not return error
|
||||
require.Nil(t, resp2.Attributes) // attributes unset so they don't change
|
||||
require.True(t, resp.Detected) // never downgrade
|
||||
|
||||
// consul no longer available
|
||||
require.Equal(t, consulUnavailable, cf.lastState)
|
||||
|
@ -501,20 +490,9 @@ func TestConsulFingerprint_Fingerprint_ent(t *testing.T) {
|
|||
|
||||
// execute second query with error
|
||||
err2 := cf.Fingerprint(&FingerprintRequest{Config: cfg, Node: node}, &resp2)
|
||||
require.NoError(t, err2) // does not return error
|
||||
require.Equal(t, map[string]string{ // attributes set empty
|
||||
"consul.datacenter": "",
|
||||
"consul.revision": "",
|
||||
"consul.segment": "",
|
||||
"consul.server": "",
|
||||
"consul.sku": "",
|
||||
"consul.version": "",
|
||||
"consul.ft.namespaces": "",
|
||||
"consul.connect": "",
|
||||
"consul.grpc": "",
|
||||
"unique.consul.name": "",
|
||||
}, resp2.Attributes)
|
||||
require.True(t, resp.Detected) // never downgrade
|
||||
require.NoError(t, err2) // does not return error
|
||||
require.Nil(t, resp2.Attributes) // attributes unset so they don't change
|
||||
require.True(t, resp.Detected) // never downgrade
|
||||
|
||||
// consul no longer available
|
||||
require.Equal(t, consulUnavailable, cf.lastState)
|
||||
|
|
|
@ -51,7 +51,7 @@ func (f *VaultFingerprint) Fingerprint(req *FingerprintRequest, resp *Fingerprin
|
|||
// Connect to vault and parse its information
|
||||
status, err := f.client.Sys().SealStatus()
|
||||
if err != nil {
|
||||
f.clearVaultAttributes(resp)
|
||||
|
||||
// Print a message indicating that Vault is not available anymore
|
||||
if f.lastState == vaultAvailable {
|
||||
f.logger.Info("Vault is unavailable")
|
||||
|
@ -80,10 +80,3 @@ func (f *VaultFingerprint) Fingerprint(req *FingerprintRequest, resp *Fingerprin
|
|||
func (f *VaultFingerprint) Periodic() (bool, time.Duration) {
|
||||
return true, 15 * time.Second
|
||||
}
|
||||
|
||||
func (f *VaultFingerprint) clearVaultAttributes(r *FingerprintResponse) {
|
||||
r.RemoveAttribute("vault.accessible")
|
||||
r.RemoveAttribute("vault.version")
|
||||
r.RemoveAttribute("vault.cluster_id")
|
||||
r.RemoveAttribute("vault.cluster_name")
|
||||
}
|
||||
|
|
|
@ -39,4 +39,21 @@ func TestVaultFingerprint(t *testing.T) {
|
|||
assertNodeAttributeContains(t, response.Attributes, "vault.version")
|
||||
assertNodeAttributeContains(t, response.Attributes, "vault.cluster_id")
|
||||
assertNodeAttributeContains(t, response.Attributes, "vault.cluster_name")
|
||||
|
||||
tv.Stop()
|
||||
|
||||
err = fp.Fingerprint(request, &response)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to fingerprint: %s", err)
|
||||
}
|
||||
|
||||
if !response.Detected {
|
||||
t.Fatalf("should still show as detected")
|
||||
}
|
||||
|
||||
assertNodeAttributeContains(t, response.Attributes, "vault.accessible")
|
||||
assertNodeAttributeContains(t, response.Attributes, "vault.version")
|
||||
assertNodeAttributeContains(t, response.Attributes, "vault.cluster_id")
|
||||
assertNodeAttributeContains(t, response.Attributes, "vault.cluster_name")
|
||||
|
||||
}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package testutil
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"math/rand"
|
||||
"os"
|
||||
|
@ -215,6 +216,9 @@ func (tv *TestVault) Stop() {
|
|||
}
|
||||
|
||||
if err := tv.cmd.Process.Kill(); err != nil {
|
||||
if errors.Is(err, os.ErrProcessDone) {
|
||||
return
|
||||
}
|
||||
tv.t.Errorf("err: %s", err)
|
||||
}
|
||||
if tv.waitCh != nil {
|
||||
|
|
|
@ -58,6 +58,12 @@ can use `nomad eval list -json` to get that list in JSON format. The `nomad eval
|
|||
status <eval ID>` command will format a specific evaluation in JSON format if
|
||||
the `-json` flag is provided.
|
||||
|
||||
#### Removing Vault/Consul from Clients
|
||||
|
||||
Nomad clients no longer have their Consul and Vault fingerprints cleared when
|
||||
connectivity is lost with Consul and Vault. To intentionally remove Consul and
|
||||
Vault from a client node, you will need to restart the client.
|
||||
|
||||
## Nomad 1.3.3
|
||||
|
||||
Environments that don't support the use of [`uid`][template_uid] and
|
||||
|
|
Loading…
Reference in New Issue