From 1ffdf04bd735d45e235ec7872def91acacfc730f Mon Sep 17 00:00:00 2001 From: Kyle Havlovitz Date: Tue, 8 Nov 2016 14:45:12 -0500 Subject: [PATCH] Set MaxStale default to 10 years and add a stale counter (#2481) Default MaxStale to 10 years and add a counter at `consul.dns.stale_queries` that tracks when an agent serves a query that's stale by at least 5 seconds. Previously, MaxStale defaulted to 5 seconds and DNS would become unavailable after a short period of time with no leader. This new default allows DNS requests to still be served in the event of a long outage. Fixes #2460. --- command/agent/config.go | 2 +- command/agent/dns.go | 39 +++++++++++++------ .../source/docs/agent/options.html.markdown | 25 ++++++------ .../source/docs/agent/telemetry.html.markdown | 6 +++ 4 files changed, 48 insertions(+), 24 deletions(-) diff --git a/command/agent/config.go b/command/agent/config.go index bc0a1fa6f..36c00d5d2 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -677,7 +677,7 @@ func DefaultConfig() *Config { DNSConfig: DNSConfig{ AllowStale: Bool(true), UDPAnswerLimit: 3, - MaxStale: 5 * time.Second, + MaxStale: 10 * 365 * 24 * time.Hour, RecursorTimeout: 2 * time.Second, }, Telemetry: Telemetry{ diff --git a/command/agent/dns.go b/command/agent/dns.go index 417685a52..7f373cd41 100644 --- a/command/agent/dns.go +++ b/command/agent/dns.go @@ -24,6 +24,9 @@ const ( // times. maxUDPAnswerLimit = 8 maxRecurseRecords = 5 + + // Increment a counter when requests staler than this are served + staleCounterThreshold = 5 * time.Second ) // DNSServer is used to wrap an Agent and expose various @@ -437,10 +440,14 @@ RPC: } // Verify that request is not too stale, redo the request - if args.AllowStale && out.LastContact > d.config.MaxStale { - args.AllowStale = false - d.logger.Printf("[WARN] dns: Query results too stale, re-requesting") - goto RPC + if args.AllowStale { + if out.LastContact > d.config.MaxStale { + args.AllowStale = false + d.logger.Printf("[WARN] dns: Query results too stale, re-requesting") + goto RPC + } else if out.LastContact > staleCounterThreshold { + metrics.IncrCounter([]string{"consul", "dns", "stale_queries"}, 1) + } } // If we have no address, return not found! @@ -637,10 +644,14 @@ RPC: } // Verify that request is not too stale, redo the request - if args.AllowStale && out.LastContact > d.config.MaxStale { - args.AllowStale = false - d.logger.Printf("[WARN] dns: Query results too stale, re-requesting") - goto RPC + if args.AllowStale { + if out.LastContact > d.config.MaxStale { + args.AllowStale = false + d.logger.Printf("[WARN] dns: Query results too stale, re-requesting") + goto RPC + } else if out.LastContact > staleCounterThreshold { + metrics.IncrCounter([]string{"consul", "dns", "stale_queries"}, 1) + } } // Determine the TTL @@ -739,10 +750,14 @@ RPC: } // Verify that request is not too stale, redo the request. - if args.AllowStale && out.LastContact > d.config.MaxStale { - args.AllowStale = false - d.logger.Printf("[WARN] dns: Query results too stale, re-requesting") - goto RPC + if args.AllowStale { + if out.LastContact > d.config.MaxStale { + args.AllowStale = false + d.logger.Printf("[WARN] dns: Query results too stale, re-requesting") + goto RPC + } else if out.LastContact > staleCounterThreshold { + metrics.IncrCounter([]string{"consul", "dns", "stale_queries"}, 1) + } } // Determine the TTL. The parse should never fail since we vet it when diff --git a/website/source/docs/agent/options.html.markdown b/website/source/docs/agent/options.html.markdown index bedb8434c..d9715fc7b 100644 --- a/website/source/docs/agent/options.html.markdown +++ b/website/source/docs/agent/options.html.markdown @@ -536,39 +536,42 @@ Consul will not enable TLS for the HTTP API unless the `https` port has been ass by the leader, providing stronger consistency but less throughput and higher latency. In Consul 0.7 and later, this defaults to true for better utilization of available servers. - * `max_stale` When [`allow_stale`](#allow_stale) - is specified, this is used to limit how - stale results are allowed to be. By default, this is set to "5s": - if a Consul server is more than 5 seconds behind the leader, the query will be - re-evaluated on the leader to get more up-to-date results. + * `max_stale` - When [`allow_stale`](#allow_stale) + is specified, this is used to limit how stale results are allowed to be. If a Consul server is + behind the leader by more than `max_stale`, the query will be re-evaluated on the leader to get + more up-to-date results. Prior to Consul 0.7.1 this defaulted to 5 seconds; in Consul 0.7.1 + and later this defaults to 10 years ("87600h") which effectively allows DNS queries to be answered + by any server, no matter how stale. In practice, servers are usually only milliseconds behind the + leader, so this lets Consul continue serving requests in long outage scenarios where no leader can + be elected. - * `node_ttl` By default, this is "0s", so all + * `node_ttl` - By default, this is "0s", so all node lookups are served with a 0 TTL value. DNS caching for node lookups can be enabled by setting this value. This should be specified with the "s" suffix for second or "m" for minute. - * `service_ttl` This is a sub-object + * `service_ttl` - This is a sub-object which allows for setting a TTL on service lookups with a per-service policy. The "*" wildcard service can be used when there is no specific policy available for a service. By default, all services are served with a 0 TTL value. DNS caching for service lookups can be enabled by setting this value. - * `enable_truncate` If set to + * `enable_truncate` - If set to true, a UDP DNS query that would return more than 3 records, or more than would fit into a valid UDP response, will set the truncated flag, indicating to clients that they should re-query using TCP to get the full set of records. - * `only_passing` If set to true, any + * `only_passing` - If set to true, any nodes whose health checks are warning or critical will be excluded from DNS results. If false, the default, only nodes whose healthchecks are failing as critical will be excluded. For service lookups, the health checks of the node itself, as well as the service-specific checks are considered. For example, if a node has a health check that is critical then all services on that node will be excluded because they are also considered critical. - * `recursor_timeout` Timeout used + * `recursor_timeout` - Timeout used by Consul when recursively querying an upstream DNS server. See `recursors` for more details. Default is 2s. This is available in Consul 0.7 and later. - * `disable_compression` If + * `disable_compression` - If set to true, DNS responses will not be compressed. Compression was added and enabled by default in Consul 0.7. diff --git a/website/source/docs/agent/telemetry.html.markdown b/website/source/docs/agent/telemetry.html.markdown index c662ae0d0..09e5f064b 100644 --- a/website/source/docs/agent/telemetry.html.markdown +++ b/website/source/docs/agent/telemetry.html.markdown @@ -177,6 +177,12 @@ These metrics give insight into the health of the cluster as a whole. ms timer + + `consul.dns.stale_queries` + Available in Consul 0.7.1 and later, this increments when an agent serves a DNS query based on information from a server that is more than 5 seconds out of date. + queries + counter + `consul.http..` This tracks how long it takes to service the given HTTP request for the given verb and path. Note that paths do not include details like service or key names, for these an underscore will be present as a placeholder (eg. `consul.http.GET.v1.kv._`)