From 17aee4d69c004b5ad9f93816cba2a10c47605c27 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Fri, 23 Sep 2022 14:45:12 -0400 Subject: [PATCH] fingerprint: don't clear Consul/Vault attributes on failure (#14673) Clients periodically fingerprint Vault and Consul to ensure the server has updated attributes in the client's fingerprint. If the client can't reach Vault/Consul, the fingerprinter clears the attributes and requires a node update. Although this seems like correct behavior so that we can detect intentional removal of Vault/Consul access, it has two serious failure modes: (1) If a local Consul agent is restarted to pick up configuration changes and the client happens to fingerprint at that moment, the client will update its fingerprint and result in evaluations for all its jobs and all the system jobs in the cluster. (2) If a client loses Vault connectivity, the same thing happens. But the consequences are much worse in the Vault case because Vault is not run as a local agent, so Vault connectivity failures are highly correlated across the entire cluster. A 15 second Vault outage will cause a new `node-update` evalution for every system job on the cluster times the number of nodes, plus one `node-update` evaluation for every non-system job on each node. On large clusters of 1000s of nodes, we've seen this create a large backlog of evaluations. This changeset updates the fingerprinting behavior to keep the last fingerprint if Consul or Vault queries fail. This prevents a storm of evaluations at the cost of requiring a client restart if Consul or Vault is intentionally removed from the client. --- .changelog/14673.txt | 3 ++ client/fingerprint/consul.go | 10 ------ client/fingerprint/consul_test.go | 34 ++++--------------- client/fingerprint/vault.go | 9 +---- client/fingerprint/vault_test.go | 17 ++++++++++ testutil/vault.go | 4 +++ .../content/docs/upgrade/upgrade-specific.mdx | 6 ++++ 7 files changed, 37 insertions(+), 46 deletions(-) create mode 100644 .changelog/14673.txt diff --git a/.changelog/14673.txt b/.changelog/14673.txt new file mode 100644 index 000000000..e934a0b08 --- /dev/null +++ b/.changelog/14673.txt @@ -0,0 +1,3 @@ +```release-note:improvement +fingerprint: Consul and Vault attributes are no longer cleared on fingerprinting failure +``` diff --git a/client/fingerprint/consul.go b/client/fingerprint/consul.go index f56267233..e1b2b449d 100644 --- a/client/fingerprint/consul.go +++ b/client/fingerprint/consul.go @@ -75,14 +75,6 @@ func (f *ConsulFingerprint) Periodic() (bool, time.Duration) { return true, 15 * time.Second } -// clearConsulAttributes removes consul attributes and links from the passed Node. -func (f *ConsulFingerprint) clearConsulAttributes(r *FingerprintResponse) { - for attr := range f.extractors { - r.RemoveAttribute(attr) - } - r.RemoveLink("consul") -} - func (f *ConsulFingerprint) initialize(req *FingerprintRequest) error { // Only create the Consul client once to avoid creating many connections if f.client == nil { @@ -118,8 +110,6 @@ func (f *ConsulFingerprint) query(resp *FingerprintResponse) agentconsul.Self { // If we can't hit this URL consul is probably not running on this machine. info, err := f.client.Agent().Self() if err != nil { - f.clearConsulAttributes(resp) - // indicate consul no longer available if f.lastState == consulAvailable { f.logger.Info("consul agent is unavailable") diff --git a/client/fingerprint/consul_test.go b/client/fingerprint/consul_test.go index 4b3887478..b8b322fb0 100644 --- a/client/fingerprint/consul_test.go +++ b/client/fingerprint/consul_test.go @@ -411,20 +411,9 @@ func TestConsulFingerprint_Fingerprint_oss(t *testing.T) { // execute second query with error err2 := cf.Fingerprint(&FingerprintRequest{Config: cfg, Node: node}, &resp2) - require.NoError(t, err2) // does not return error - require.Equal(t, map[string]string{ // attributes set empty - "consul.datacenter": "", - "consul.revision": "", - "consul.segment": "", - "consul.server": "", - "consul.sku": "", - "consul.version": "", - "unique.consul.name": "", - "consul.connect": "", - "consul.grpc": "", - "consul.ft.namespaces": "", - }, resp2.Attributes) - require.True(t, resp.Detected) // never downgrade + require.NoError(t, err2) // does not return error + require.Nil(t, resp2.Attributes) // attributes unset so they don't change + require.True(t, resp.Detected) // never downgrade // consul no longer available require.Equal(t, consulUnavailable, cf.lastState) @@ -501,20 +490,9 @@ func TestConsulFingerprint_Fingerprint_ent(t *testing.T) { // execute second query with error err2 := cf.Fingerprint(&FingerprintRequest{Config: cfg, Node: node}, &resp2) - require.NoError(t, err2) // does not return error - require.Equal(t, map[string]string{ // attributes set empty - "consul.datacenter": "", - "consul.revision": "", - "consul.segment": "", - "consul.server": "", - "consul.sku": "", - "consul.version": "", - "consul.ft.namespaces": "", - "consul.connect": "", - "consul.grpc": "", - "unique.consul.name": "", - }, resp2.Attributes) - require.True(t, resp.Detected) // never downgrade + require.NoError(t, err2) // does not return error + require.Nil(t, resp2.Attributes) // attributes unset so they don't change + require.True(t, resp.Detected) // never downgrade // consul no longer available require.Equal(t, consulUnavailable, cf.lastState) diff --git a/client/fingerprint/vault.go b/client/fingerprint/vault.go index 3ef933d9e..5639e861c 100644 --- a/client/fingerprint/vault.go +++ b/client/fingerprint/vault.go @@ -51,7 +51,7 @@ func (f *VaultFingerprint) Fingerprint(req *FingerprintRequest, resp *Fingerprin // Connect to vault and parse its information status, err := f.client.Sys().SealStatus() if err != nil { - f.clearVaultAttributes(resp) + // Print a message indicating that Vault is not available anymore if f.lastState == vaultAvailable { f.logger.Info("Vault is unavailable") @@ -80,10 +80,3 @@ func (f *VaultFingerprint) Fingerprint(req *FingerprintRequest, resp *Fingerprin func (f *VaultFingerprint) Periodic() (bool, time.Duration) { return true, 15 * time.Second } - -func (f *VaultFingerprint) clearVaultAttributes(r *FingerprintResponse) { - r.RemoveAttribute("vault.accessible") - r.RemoveAttribute("vault.version") - r.RemoveAttribute("vault.cluster_id") - r.RemoveAttribute("vault.cluster_name") -} diff --git a/client/fingerprint/vault_test.go b/client/fingerprint/vault_test.go index 2891d5057..c56ff9138 100644 --- a/client/fingerprint/vault_test.go +++ b/client/fingerprint/vault_test.go @@ -39,4 +39,21 @@ func TestVaultFingerprint(t *testing.T) { assertNodeAttributeContains(t, response.Attributes, "vault.version") assertNodeAttributeContains(t, response.Attributes, "vault.cluster_id") assertNodeAttributeContains(t, response.Attributes, "vault.cluster_name") + + tv.Stop() + + err = fp.Fingerprint(request, &response) + if err != nil { + t.Fatalf("Failed to fingerprint: %s", err) + } + + if !response.Detected { + t.Fatalf("should still show as detected") + } + + assertNodeAttributeContains(t, response.Attributes, "vault.accessible") + assertNodeAttributeContains(t, response.Attributes, "vault.version") + assertNodeAttributeContains(t, response.Attributes, "vault.cluster_id") + assertNodeAttributeContains(t, response.Attributes, "vault.cluster_name") + } diff --git a/testutil/vault.go b/testutil/vault.go index 4c5aa05f4..5cd4f81dd 100644 --- a/testutil/vault.go +++ b/testutil/vault.go @@ -1,6 +1,7 @@ package testutil import ( + "errors" "fmt" "math/rand" "os" @@ -215,6 +216,9 @@ func (tv *TestVault) Stop() { } if err := tv.cmd.Process.Kill(); err != nil { + if errors.Is(err, os.ErrProcessDone) { + return + } tv.t.Errorf("err: %s", err) } if tv.waitCh != nil { diff --git a/website/content/docs/upgrade/upgrade-specific.mdx b/website/content/docs/upgrade/upgrade-specific.mdx index fa804db66..b9eb22281 100644 --- a/website/content/docs/upgrade/upgrade-specific.mdx +++ b/website/content/docs/upgrade/upgrade-specific.mdx @@ -58,6 +58,12 @@ can use `nomad eval list -json` to get that list in JSON format. The `nomad eval status ` command will format a specific evaluation in JSON format if the `-json` flag is provided. +#### Removing Vault/Consul from Clients + +Nomad clients no longer have their Consul and Vault fingerprints cleared when +connectivity is lost with Consul and Vault. To intentionally remove Consul and +Vault from a client node, you will need to restart the client. + ## Nomad 1.3.3 Environments that don't support the use of [`uid`][template_uid] and