diff --git a/.changelog/10611.txt b/.changelog/10611.txt new file mode 100644 index 000000000..2c6c1e20a --- /dev/null +++ b/.changelog/10611.txt @@ -0,0 +1,3 @@ +```release-note:improvement +config: add `dns_config.recursor_strategy` flag to control the order which DNS recursors are queried +``` diff --git a/agent/config/builder.go b/agent/config/builder.go index 7c4d965a2..fae31bac6 100644 --- a/agent/config/builder.go +++ b/agent/config/builder.go @@ -908,6 +908,7 @@ func (b *builder) build() (rt RuntimeConfig, err error) { DNSNodeTTL: b.durationVal("dns_config.node_ttl", c.DNS.NodeTTL), DNSOnlyPassing: boolVal(c.DNS.OnlyPassing), DNSPort: dnsPort, + DNSRecursorStrategy: b.dnsRecursorStrategyVal(stringVal(c.DNS.RecursorStrategy)), DNSRecursorTimeout: b.durationVal("recursor_timeout", c.DNS.RecursorTimeout), DNSRecursors: dnsRecursors, DNSServiceTTL: dnsServiceTTL, @@ -1745,6 +1746,20 @@ func (b *builder) meshGatewayConfVal(mgConf *MeshGatewayConfig) structs.MeshGate return cfg } +func (b *builder) dnsRecursorStrategyVal(v string) dns.RecursorStrategy { + var out dns.RecursorStrategy + + switch dns.RecursorStrategy(v) { + case dns.RecursorStrategyRandom: + out = dns.RecursorStrategyRandom + case dns.RecursorStrategySequential, "": + out = dns.RecursorStrategySequential + default: + b.err = multierror.Append(b.err, fmt.Errorf("dns_config.recursor_strategy: invalid strategy: %q", v)) + } + return out +} + func (b *builder) exposeConfVal(v *ExposeConfig) structs.ExposeConfig { var out structs.ExposeConfig if v == nil { diff --git a/agent/config/config.go b/agent/config/config.go index 3dd6c7b45..6d511df04 100644 --- a/agent/config/config.go +++ b/agent/config/config.go @@ -634,6 +634,7 @@ type DNS struct { MaxStale *string `mapstructure:"max_stale"` NodeTTL *string `mapstructure:"node_ttl"` OnlyPassing *bool `mapstructure:"only_passing"` + RecursorStrategy *string `mapstructure:"recursor_strategy"` RecursorTimeout *string `mapstructure:"recursor_timeout"` ServiceTTL map[string]string `mapstructure:"service_ttl"` UDPAnswerLimit *int `mapstructure:"udp_answer_limit"` diff --git a/agent/config/runtime.go b/agent/config/runtime.go index 65d3b0d4d..5a2857919 100644 --- a/agent/config/runtime.go +++ b/agent/config/runtime.go @@ -12,6 +12,7 @@ import ( "github.com/hashicorp/consul/agent/cache" "github.com/hashicorp/consul/agent/consul" + "github.com/hashicorp/consul/agent/dns" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/token" "github.com/hashicorp/consul/api" @@ -270,6 +271,15 @@ type RuntimeConfig struct { // hcl: dns_config { only_passing = (true|false) } DNSOnlyPassing bool + // DNSRecursorStrategy controls the order in which DNS recursors are queried. + // 'sequential' queries recursors in the order they are listed under `recursors`. + // 'random' causes random selection of recursors which has the effect of + // spreading the query load among all listed servers, rather than having + // client agents try the first server in the list every time. + // + // hcl: dns_config { recursor_strategy = "(random|sequential)" } + DNSRecursorStrategy dns.RecursorStrategy + // DNSRecursorTimeout specifies the timeout in seconds // for Consul's internal dns client used for recursion. // This value is used for the connection, read and write timeout. diff --git a/agent/config/runtime_test.go b/agent/config/runtime_test.go index 869b67cbe..b4364983d 100644 --- a/agent/config/runtime_test.go +++ b/agent/config/runtime_test.go @@ -5425,6 +5425,7 @@ func TestLoad_FullConfig(t *testing.T) { DNSNodeTTL: 7084 * time.Second, DNSOnlyPassing: true, DNSPort: 7001, + DNSRecursorStrategy: "sequential", DNSRecursorTimeout: 4427 * time.Second, DNSRecursors: []string{"63.38.39.58", "92.49.18.18"}, DNSSOA: RuntimeSOAConfig{Refresh: 3600, Retry: 600, Expire: 86400, Minttl: 0}, diff --git a/agent/config/testdata/TestRuntimeConfig_Sanitize.golden b/agent/config/testdata/TestRuntimeConfig_Sanitize.golden index 903dada59..64ddefe51 100644 --- a/agent/config/testdata/TestRuntimeConfig_Sanitize.golden +++ b/agent/config/testdata/TestRuntimeConfig_Sanitize.golden @@ -147,6 +147,7 @@ "DNSNodeTTL": "0s", "DNSOnlyPassing": false, "DNSPort": 0, + "DNSRecursorStrategy": "", "DNSRecursorTimeout": "0s", "DNSRecursors": [], "DNSSOA": { diff --git a/agent/dns.go b/agent/dns.go index e14d289a8..992a0e467 100644 --- a/agent/dns.go +++ b/agent/dns.go @@ -70,22 +70,23 @@ type dnsSOAConfig struct { } type dnsConfig struct { - AllowStale bool - Datacenter string - EnableTruncate bool - MaxStale time.Duration - UseCache bool - CacheMaxAge time.Duration - NodeName string - NodeTTL time.Duration - OnlyPassing bool - RecursorTimeout time.Duration - Recursors []string - SegmentName string - UDPAnswerLimit int - ARecordLimit int - NodeMetaTXT bool - SOAConfig dnsSOAConfig + AllowStale bool + Datacenter string + EnableTruncate bool + MaxStale time.Duration + UseCache bool + CacheMaxAge time.Duration + NodeName string + NodeTTL time.Duration + OnlyPassing bool + RecursorStrategy agentdns.RecursorStrategy + RecursorTimeout time.Duration + Recursors []string + SegmentName string + UDPAnswerLimit int + ARecordLimit int + NodeMetaTXT bool + SOAConfig dnsSOAConfig // TTLRadix sets service TTLs by prefix, eg: "database-*" TTLRadix *radix.Tree // TTLStict sets TTLs to service by full name match. It Has higher priority than TTLRadix @@ -154,6 +155,7 @@ func GetDNSConfig(conf *config.RuntimeConfig) (*dnsConfig, error) { NodeName: conf.NodeName, NodeTTL: conf.DNSNodeTTL, OnlyPassing: conf.DNSOnlyPassing, + RecursorStrategy: conf.DNSRecursorStrategy, RecursorTimeout: conf.DNSRecursorTimeout, SegmentName: conf.SegmentName, UDPAnswerLimit: conf.DNSUDPAnswerLimit, @@ -1851,7 +1853,8 @@ func (d *DNSServer) handleRecurse(resp dns.ResponseWriter, req *dns.Msg) { var r *dns.Msg var rtt time.Duration var err error - for _, recursor := range cfg.Recursors { + for _, idx := range cfg.RecursorStrategy.Indexes(len(cfg.Recursors)) { + recursor := cfg.Recursors[idx] r, rtt, err = c.Exchange(req, recursor) // Check if the response is valid and has the desired Response code if r != nil && (r.Rcode != dns.RcodeSuccess && r.Rcode != dns.RcodeNameError) { @@ -1936,7 +1939,8 @@ func (d *DNSServer) resolveCNAME(cfg *dnsConfig, name string, maxRecursionLevel var r *dns.Msg var rtt time.Duration var err error - for _, recursor := range cfg.Recursors { + for _, idx := range cfg.RecursorStrategy.Indexes(len(cfg.Recursors)) { + recursor := cfg.Recursors[idx] r, rtt, err = c.Exchange(m, recursor) if err == nil { d.logger.Debug("cname recurse RTT for name", diff --git a/agent/dns/dns.go b/agent/dns/dns.go index 8744eb351..74d5773e8 100644 --- a/agent/dns/dns.go +++ b/agent/dns/dns.go @@ -1,6 +1,9 @@ package dns -import "regexp" +import ( + "math/rand" + "regexp" +) // MaxLabelLength is the maximum length for a name that can be used in DNS. const MaxLabelLength = 63 @@ -8,3 +11,24 @@ const MaxLabelLength = 63 // InvalidNameRe is a regex that matches characters which can not be included in // a DNS name. var InvalidNameRe = regexp.MustCompile(`[^A-Za-z0-9\\-]+`) + +type RecursorStrategy string + +const ( + RecursorStrategySequential RecursorStrategy = "sequential" + RecursorStrategyRandom RecursorStrategy = "random" +) + +func (s RecursorStrategy) Indexes(max int) []int { + switch s { + case RecursorStrategyRandom: + return rand.Perm(max) + default: + idxs := make([]int, max) + for i := range idxs { + idxs[i] = i + } + return idxs + + } +} diff --git a/agent/dns/dns_test.go b/agent/dns/dns_test.go new file mode 100644 index 000000000..d401bf93b --- /dev/null +++ b/agent/dns/dns_test.go @@ -0,0 +1,40 @@ +package dns + +import ( + "testing" + + "github.com/hashicorp/consul/sdk/testutil/retry" + "github.com/stretchr/testify/require" +) + +func TestDNS_Recursor_StrategyRandom(t *testing.T) { + configuredRecursors := []string{"1.1.1.1", "8.8.4.4", "8.8.8.8"} + recursorStrategy := RecursorStrategy("random") + + retry.RunWith(&retry.Counter{Count: 5}, t, func(r *retry.R) { + recursorsToQuery := make([]string, 0) + for _, idx := range recursorStrategy.Indexes(len(configuredRecursors)) { + recursorsToQuery = append(recursorsToQuery, configuredRecursors[idx]) + } + + // Ensure the slices contain the same elements + require.ElementsMatch(t, configuredRecursors, recursorsToQuery) + + // Ensure the elements are not in the same order + require.NotEqual(r, configuredRecursors, recursorsToQuery) + }) +} + +func TestDNS_Recursor_StrategySequential(t *testing.T) { + expectedRecursors := []string{"1.1.1.1", "8.8.4.4", "8.8.8.8"} + recursorStrategy := RecursorStrategy("sequential") + + recursorsToQuery := make([]string, 0) + for _, idx := range recursorStrategy.Indexes(len(expectedRecursors)) { + recursorsToQuery = append(recursorsToQuery, expectedRecursors[idx]) + } + + // The list of recursors should match the order in which they were defined + // in the configuration + require.Equal(t, recursorsToQuery, expectedRecursors) +} diff --git a/agent/dns_test.go b/agent/dns_test.go index 11448ab1b..60e5e1db2 100644 --- a/agent/dns_test.go +++ b/agent/dns_test.go @@ -7610,6 +7610,7 @@ func TestDNS_ConfigReload(t *testing.T) { } enable_truncate = false only_passing = false + recursor_strategy = "sequential" recursor_timeout = "15s" disable_compression = false a_record_limit = 1 @@ -7628,6 +7629,7 @@ func TestDNS_ConfigReload(t *testing.T) { for _, s := range a.dnsServers { cfg := s.config.Load().(*dnsConfig) require.Equal(t, []string{"8.8.8.8:53"}, cfg.Recursors) + require.Equal(t, agentdns.RecursorStrategy("sequential"), cfg.RecursorStrategy) require.False(t, cfg.AllowStale) require.Equal(t, 20*time.Second, cfg.MaxStale) require.Equal(t, 10*time.Second, cfg.NodeTTL) @@ -7658,6 +7660,7 @@ func TestDNS_ConfigReload(t *testing.T) { } newCfg.DNSEnableTruncate = true newCfg.DNSOnlyPassing = true + newCfg.DNSRecursorStrategy = "random" newCfg.DNSRecursorTimeout = 16 * time.Second newCfg.DNSDisableCompression = true newCfg.DNSARecordLimit = 2 @@ -7673,6 +7676,7 @@ func TestDNS_ConfigReload(t *testing.T) { for _, s := range a.dnsServers { cfg := s.config.Load().(*dnsConfig) require.Equal(t, []string{"1.1.1.1:53"}, cfg.Recursors) + require.Equal(t, agentdns.RecursorStrategy("random"), cfg.RecursorStrategy) require.True(t, cfg.AllowStale) require.Equal(t, 21*time.Second, cfg.MaxStale) require.Equal(t, 11*time.Second, cfg.NodeTTL) diff --git a/website/content/docs/agent/options.mdx b/website/content/docs/agent/options.mdx index ea57949a3..2ecbe11d2 100644 --- a/website/content/docs/agent/options.mdx +++ b/website/content/docs/agent/options.mdx @@ -1357,6 +1357,11 @@ bind_addr = "{{ GetPrivateInterfaces | include \"network\" \"10.0.0.0/8\" | attr then all services on that node will be excluded because they are also considered critical. + - `recursor_strategy` - If set to `sequential`, Consul will query recursors in the + order listed in the [`recursors`](#recursors) option. If set to `random`, + Consul will query an upstream DNS resolvers in a random order. Defaults to + `sequential`. + - `recursor_timeout` - Timeout used by Consul when recursively querying an upstream DNS server. See [`recursors`](#recursors) for more details. Default is 2s. This is available in Consul 0.7 and later.