From d0f9e887f7bda6a7686c43c050e634e4401f9bad Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Thu, 17 Nov 2022 12:09:36 -0500 Subject: [PATCH] autopilot: include only servers from the same region (#15290) When we migrated to the updated autopilot library in Nomad 1.4.0, the interface for finding servers changed. Previously autopilot would get the serf members and call `IsServer` on each of them, leaving it up to the implementor to filter out clients (and in Nomad's case, other regions). But in the "new" autopilot library, the equivalent interface is `KnownServers` for which we did not filter by region. This causes spurious attempts for the cross-region stats fetching, which results in TLS errors and a lot of log noise. Filter the member set by region to fix the regression. --- .changelog/15290.txt | 3 +++ nomad/autopilot.go | 5 ++++- nomad/autopilot_test.go | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 .changelog/15290.txt diff --git a/.changelog/15290.txt b/.changelog/15290.txt new file mode 100644 index 000000000..a7405ffe4 --- /dev/null +++ b/.changelog/15290.txt @@ -0,0 +1,3 @@ +```release-note:bug +autopilot: Fixed a bug where autopilot would try to fetch raft stats from other regions +``` diff --git a/nomad/autopilot.go b/nomad/autopilot.go index c5a382ece..52a303e50 100644 --- a/nomad/autopilot.go +++ b/nomad/autopilot.go @@ -195,7 +195,7 @@ func (s *Server) autopilotServers() map[raft.ServerID]*autopilot.Server { s.logger.Warn("Error parsing server info", "name", member.Name, "error", err) continue } else if srv == nil { - // this member was a client + // this member was a client or in another region continue } @@ -210,6 +210,9 @@ func (s *Server) autopilotServer(m serf.Member) (*autopilot.Server, error) { if !ok { return nil, nil } + if srv.Region != s.Region() { + return nil, nil + } return s.autopilotServerFromMetadata(srv) } diff --git a/nomad/autopilot_test.go b/nomad/autopilot_test.go index fb2afbe08..4b417d3db 100644 --- a/nomad/autopilot_test.go +++ b/nomad/autopilot_test.go @@ -190,6 +190,42 @@ func TestAutopilot_RollingUpdate(t *testing.T) { waitForStableLeadership(t, servers) } +func TestAutopilot_MultiRegion(t *testing.T) { + ci.Parallel(t) + + conf := func(c *Config) { + c.NumSchedulers = 0 // reduces test log noise + c.BootstrapExpect = 3 + } + s1, cleanupS1 := TestServer(t, conf) + defer cleanupS1() + + s2, cleanupS2 := TestServer(t, conf) + defer cleanupS2() + + s3, cleanupS3 := TestServer(t, conf) + defer cleanupS3() + + // federated regions should not be considered raft peers or show up in the + // known servers list + s4, cleanupS4 := TestServer(t, func(c *Config) { + c.BootstrapExpect = 0 + c.Region = "other" + }) + defer cleanupS4() + + servers := []*Server{s1, s2, s3} + TestJoin(t, s1, s2, s3, s4) + + t.Logf("waiting for initial stable cluster") + waitForStableLeadership(t, servers) + + apDelegate := &AutopilotDelegate{s3} + known := apDelegate.KnownServers() + must.Eq(t, 3, len(known)) + +} + func TestAutopilot_CleanupStaleRaftServer(t *testing.T) { ci.Parallel(t)