autopilot: include only servers from the same region (#15290)

When we migrated to the updated autopilot library in Nomad 1.4.0, the interface
for finding servers changed. Previously autopilot would get the serf members and
call `IsServer` on each of them, leaving it up to the implementor to filter out
clients (and in Nomad's case, other regions). But in the "new" autopilot
library, the equivalent interface is `KnownServers` for which we did not filter
by region. This causes spurious attempts for the cross-region stats fetching,
which results in TLS errors and a lot of log noise.

Filter the member set by region to fix the regression.
This commit is contained in:
Tim Gross 2022-11-17 12:09:36 -05:00 committed by GitHub
parent 510eb435dc
commit d0f9e887f7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 43 additions and 1 deletions

3
.changelog/15290.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:bug
autopilot: Fixed a bug where autopilot would try to fetch raft stats from other regions
```

View File

@ -195,7 +195,7 @@ func (s *Server) autopilotServers() map[raft.ServerID]*autopilot.Server {
s.logger.Warn("Error parsing server info", "name", member.Name, "error", err)
continue
} else if srv == nil {
// this member was a client
// this member was a client or in another region
continue
}
@ -210,6 +210,9 @@ func (s *Server) autopilotServer(m serf.Member) (*autopilot.Server, error) {
if !ok {
return nil, nil
}
if srv.Region != s.Region() {
return nil, nil
}
return s.autopilotServerFromMetadata(srv)
}

View File

@ -190,6 +190,42 @@ func TestAutopilot_RollingUpdate(t *testing.T) {
waitForStableLeadership(t, servers)
}
func TestAutopilot_MultiRegion(t *testing.T) {
ci.Parallel(t)
conf := func(c *Config) {
c.NumSchedulers = 0 // reduces test log noise
c.BootstrapExpect = 3
}
s1, cleanupS1 := TestServer(t, conf)
defer cleanupS1()
s2, cleanupS2 := TestServer(t, conf)
defer cleanupS2()
s3, cleanupS3 := TestServer(t, conf)
defer cleanupS3()
// federated regions should not be considered raft peers or show up in the
// known servers list
s4, cleanupS4 := TestServer(t, func(c *Config) {
c.BootstrapExpect = 0
c.Region = "other"
})
defer cleanupS4()
servers := []*Server{s1, s2, s3}
TestJoin(t, s1, s2, s3, s4)
t.Logf("waiting for initial stable cluster")
waitForStableLeadership(t, servers)
apDelegate := &AutopilotDelegate{s3}
known := apDelegate.KnownServers()
must.Eq(t, 3, len(known))
}
func TestAutopilot_CleanupStaleRaftServer(t *testing.T) {
ci.Parallel(t)