Set MaxStale default to 10 years and add a stale counter (#2481)
Default MaxStale to 10 years and add a counter at `consul.dns.stale_queries` that tracks when an agent serves a query that's stale by more than 5 seconds. Previously, MaxStale defaulted to 5 seconds and DNS would become unavailable after a short period of time with no leader. This new default allows DNS requests to still be served in the event of a long outage. Fixes #2460.
This commit is contained in:
parent
a629920e25
commit
1ffdf04bd7
|
@ -677,7 +677,7 @@ func DefaultConfig() *Config {
|
||||||
DNSConfig: DNSConfig{
|
DNSConfig: DNSConfig{
|
||||||
AllowStale: Bool(true),
|
AllowStale: Bool(true),
|
||||||
UDPAnswerLimit: 3,
|
UDPAnswerLimit: 3,
|
||||||
MaxStale: 5 * time.Second,
|
MaxStale: 10 * 365 * 24 * time.Hour,
|
||||||
RecursorTimeout: 2 * time.Second,
|
RecursorTimeout: 2 * time.Second,
|
||||||
},
|
},
|
||||||
Telemetry: Telemetry{
|
Telemetry: Telemetry{
|
||||||
|
|
|
@ -24,6 +24,9 @@ const (
|
||||||
// times.
|
// times.
|
||||||
maxUDPAnswerLimit = 8
|
maxUDPAnswerLimit = 8
|
||||||
maxRecurseRecords = 5
|
maxRecurseRecords = 5
|
||||||
|
|
||||||
|
// Increment a counter when requests staler than this are served
|
||||||
|
staleCounterThreshold = 5 * time.Second
|
||||||
)
|
)
|
||||||
|
|
||||||
// DNSServer is used to wrap an Agent and expose various
|
// DNSServer is used to wrap an Agent and expose various
|
||||||
|
@ -437,10 +440,14 @@ RPC:
|
||||||
}
|
}
|
||||||
|
|
||||||
// Verify that request is not too stale, redo the request
|
// Verify that request is not too stale, redo the request
|
||||||
if args.AllowStale && out.LastContact > d.config.MaxStale {
|
if args.AllowStale {
|
||||||
args.AllowStale = false
|
if out.LastContact > d.config.MaxStale {
|
||||||
d.logger.Printf("[WARN] dns: Query results too stale, re-requesting")
|
args.AllowStale = false
|
||||||
goto RPC
|
d.logger.Printf("[WARN] dns: Query results too stale, re-requesting")
|
||||||
|
goto RPC
|
||||||
|
} else if out.LastContact > staleCounterThreshold {
|
||||||
|
metrics.IncrCounter([]string{"consul", "dns", "stale_queries"}, 1)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// If we have no address, return not found!
|
// If we have no address, return not found!
|
||||||
|
@ -637,10 +644,14 @@ RPC:
|
||||||
}
|
}
|
||||||
|
|
||||||
// Verify that request is not too stale, redo the request
|
// Verify that request is not too stale, redo the request
|
||||||
if args.AllowStale && out.LastContact > d.config.MaxStale {
|
if args.AllowStale {
|
||||||
args.AllowStale = false
|
if out.LastContact > d.config.MaxStale {
|
||||||
d.logger.Printf("[WARN] dns: Query results too stale, re-requesting")
|
args.AllowStale = false
|
||||||
goto RPC
|
d.logger.Printf("[WARN] dns: Query results too stale, re-requesting")
|
||||||
|
goto RPC
|
||||||
|
} else if out.LastContact > staleCounterThreshold {
|
||||||
|
metrics.IncrCounter([]string{"consul", "dns", "stale_queries"}, 1)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Determine the TTL
|
// Determine the TTL
|
||||||
|
@ -739,10 +750,14 @@ RPC:
|
||||||
}
|
}
|
||||||
|
|
||||||
// Verify that request is not too stale, redo the request.
|
// Verify that request is not too stale, redo the request.
|
||||||
if args.AllowStale && out.LastContact > d.config.MaxStale {
|
if args.AllowStale {
|
||||||
args.AllowStale = false
|
if out.LastContact > d.config.MaxStale {
|
||||||
d.logger.Printf("[WARN] dns: Query results too stale, re-requesting")
|
args.AllowStale = false
|
||||||
goto RPC
|
d.logger.Printf("[WARN] dns: Query results too stale, re-requesting")
|
||||||
|
goto RPC
|
||||||
|
} else if out.LastContact > staleCounterThreshold {
|
||||||
|
metrics.IncrCounter([]string{"consul", "dns", "stale_queries"}, 1)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Determine the TTL. The parse should never fail since we vet it when
|
// Determine the TTL. The parse should never fail since we vet it when
|
||||||
|
|
|
@ -536,39 +536,42 @@ Consul will not enable TLS for the HTTP API unless the `https` port has been ass
|
||||||
by the leader, providing stronger consistency but less throughput and higher latency. In Consul
|
by the leader, providing stronger consistency but less throughput and higher latency. In Consul
|
||||||
0.7 and later, this defaults to true for better utilization of available servers.
|
0.7 and later, this defaults to true for better utilization of available servers.
|
||||||
|
|
||||||
* <a name="max_stale"></a><a href="#max_stale">`max_stale`</a> When [`allow_stale`](#allow_stale)
|
* <a name="max_stale"></a><a href="#max_stale">`max_stale`</a> - When [`allow_stale`](#allow_stale)
|
||||||
is specified, this is used to limit how
|
is specified, this is used to limit how stale results are allowed to be. If a Consul server is
|
||||||
stale results are allowed to be. By default, this is set to "5s":
|
behind the leader by more than `max_stale`, the query will be re-evaluated on the leader to get
|
||||||
if a Consul server is more than 5 seconds behind the leader, the query will be
|
more up-to-date results. Prior to Consul 0.7.1 this defaulted to 5 seconds; in Consul 0.7.1
|
||||||
re-evaluated on the leader to get more up-to-date results.
|
and later this defaults to 10 years ("87600h") which effectively allows DNS queries to be answered
|
||||||
|
by any server, no matter how stale. In practice, servers are usually only milliseconds behind the
|
||||||
|
leader, so this lets Consul continue serving requests in long outage scenarios where no leader can
|
||||||
|
be elected.
|
||||||
|
|
||||||
* <a name="node_ttl"></a><a href="#node_ttl">`node_ttl`</a> By default, this is "0s", so all
|
* <a name="node_ttl"></a><a href="#node_ttl">`node_ttl`</a> - By default, this is "0s", so all
|
||||||
node lookups are served with a 0 TTL value. DNS caching for node lookups can be enabled by
|
node lookups are served with a 0 TTL value. DNS caching for node lookups can be enabled by
|
||||||
setting this value. This should be specified with the "s" suffix for second or "m" for minute.
|
setting this value. This should be specified with the "s" suffix for second or "m" for minute.
|
||||||
|
|
||||||
* <a name="service_ttl"></a><a href="#service_ttl">`service_ttl`</a> This is a sub-object
|
* <a name="service_ttl"></a><a href="#service_ttl">`service_ttl`</a> - This is a sub-object
|
||||||
which allows for setting a TTL on service lookups with a per-service policy. The "*" wildcard
|
which allows for setting a TTL on service lookups with a per-service policy. The "*" wildcard
|
||||||
service can be used when there is no specific policy available for a service. By default, all
|
service can be used when there is no specific policy available for a service. By default, all
|
||||||
services are served with a 0 TTL value. DNS caching for service lookups can be enabled by
|
services are served with a 0 TTL value. DNS caching for service lookups can be enabled by
|
||||||
setting this value.
|
setting this value.
|
||||||
|
|
||||||
* <a name="enable_truncate"></a><a href="#enable_truncate">`enable_truncate`</a> If set to
|
* <a name="enable_truncate"></a><a href="#enable_truncate">`enable_truncate`</a> - If set to
|
||||||
true, a UDP DNS query that would return more than 3 records, or more than would fit into a valid
|
true, a UDP DNS query that would return more than 3 records, or more than would fit into a valid
|
||||||
UDP response, will set the truncated flag, indicating to clients that they should re-query
|
UDP response, will set the truncated flag, indicating to clients that they should re-query
|
||||||
using TCP to get the full set of records.
|
using TCP to get the full set of records.
|
||||||
|
|
||||||
* <a name="only_passing"></a><a href="#only_passing">`only_passing`</a> If set to true, any
|
* <a name="only_passing"></a><a href="#only_passing">`only_passing`</a> - If set to true, any
|
||||||
nodes whose health checks are warning or critical will be excluded from DNS results. If false,
|
nodes whose health checks are warning or critical will be excluded from DNS results. If false,
|
||||||
the default, only nodes whose health checks are failing as critical will be excluded. For
|
the default, only nodes whose health checks are failing as critical will be excluded. For
|
||||||
service lookups, the health checks of the node itself, as well as the service-specific checks
|
service lookups, the health checks of the node itself, as well as the service-specific checks
|
||||||
are considered. For example, if a node has a health check that is critical then all services on
|
are considered. For example, if a node has a health check that is critical then all services on
|
||||||
that node will be excluded because they are also considered critical.
|
that node will be excluded because they are also considered critical.
|
||||||
|
|
||||||
* <a name="recursor_timeout"></a><a href="#recursor_timeout">`recursor_timeout`</a> Timeout used
|
* <a name="recursor_timeout"></a><a href="#recursor_timeout">`recursor_timeout`</a> - Timeout used
|
||||||
by Consul when recursively querying an upstream DNS server. See <a href="#recursors">`recursors`</a>
|
by Consul when recursively querying an upstream DNS server. See <a href="#recursors">`recursors`</a>
|
||||||
for more details. Default is 2s. This is available in Consul 0.7 and later.
|
for more details. Default is 2s. This is available in Consul 0.7 and later.
|
||||||
|
|
||||||
* <a name="disable_compression"></a><a href="#disable_compression">`disable_compression`</a> If
|
* <a name="disable_compression"></a><a href="#disable_compression">`disable_compression`</a> - If
|
||||||
set to true, DNS responses will not be compressed. Compression was added and enabled by default
|
set to true, DNS responses will not be compressed. Compression was added and enabled by default
|
||||||
in Consul 0.7.
|
in Consul 0.7.
|
||||||
|
|
||||||
|
|
|
@ -177,6 +177,12 @@ These metrics give insight into the health of the cluster as a whole.
|
||||||
<td>ms</td>
|
<td>ms</td>
|
||||||
<td>timer</td>
|
<td>timer</td>
|
||||||
</tr>
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>`consul.dns.stale_queries`</td>
|
||||||
|
<td>Available in Consul 0.7.1 and later, this increments when an agent serves a DNS query based on information from a server that is more than 5 seconds out of date.</td>
|
||||||
|
<td>queries</td>
|
||||||
|
<td>counter</td>
|
||||||
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td>`consul.http.<verb>.<path>`</td>
|
<td>`consul.http.<verb>.<path>`</td>
|
||||||
<td>This tracks how long it takes to service the given HTTP request for the given verb and path. Note that paths do not include details like service or key names; for these, an underscore will be present as a placeholder (e.g. `consul.http.GET.v1.kv._`).</td>
|
<td>This tracks how long it takes to service the given HTTP request for the given verb and path. Note that paths do not include details like service or key names; for these, an underscore will be present as a placeholder (e.g. `consul.http.GET.v1.kv._`).</td>
|
||||||
|
|
Loading…
Reference in a new issue