open-nomad/nomad/autopilot_test.go
Mahmood Ali 816a93ed4a tests: deflake TestAutopilot_RollingUpdate
I hypothesize that the flakiness in the rolling update test is caused by
shutting down the s3 server before s4 is properly added as a voter.

The chain of events leading to the flakiness is as follows:

1. Bootstrap with s1, s2, s3
2. Add s4
3. Wait for servers to register with 3 voting peers (see the sketch after this list)
   * But we already have 3 voters (s1, s2, and s3)
   * s4 is added as a non-voter in Raft v3 and must wait until autopilot promotes it
4. Test proceeds without s4 being a voter
5. s3 is shut down
6. Cluster changes stall due to the leader election and too many pending
   configuration changes (e.g. removing s3 from raft, promoting s4)
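
The pre-fix wait presumably stopped at the 3-voter check. This is a hypothetical
reconstruction (the pre-fix code isn't shown in this commit); note that it can pass
while s4 is still a raft non-voter:

```go
// Hypothetical pre-fix wait: requiring only 3 voting peers succeeds as
// soon as s1, s2, and s3 are voters, even while s4 remains a non-voter
// awaiting autopilot promotion.
servers = append(servers, s4)
retry.Run(t, func(r *retry.R) {
	r.Check(wantRaft(servers))
	for _, s := range servers {
		r.Check(wantPeers(s, 3))
	}
})
```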

Here, I have the test wait until s4 is marked as a voter before shutting
down s3, so we don't have too many configuration changes at once.
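
Condensed from TestAutopilot_RollingUpdate below, the fix tightens that wait to 4
voting peers. Since wantPeers counts only voters, this cannot pass until autopilot
has promoted s4:

```go
// Post-fix wait: every server must report 4 voting peers, which implies
// autopilot has promoted s4, before the test shuts down s3.
servers = append(servers, s4)
retry.Run(t, func(r *retry.R) {
	r.Check(wantRaft(servers))
	for _, s := range servers {
		r.Check(wantPeers(s, 4))
	}
})
```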

In https://circleci.com/gh/hashicorp/nomad/57092, I noticed the
following events:

```
TestAutopilot_RollingUpdate: autopilot_test.go:204: adding server s4
    TestAutopilot_RollingUpdate: testlog.go:34: 2020-04-03T20:08:19.789Z [INFO]  nomad/serf.go:60: nomad: adding server: server="nomad-137.global (Addr: 127.0.0.1:9177) (DC: dc1)"
    TestAutopilot_RollingUpdate: testlog.go:34: 2020-04-03T20:08:19.789Z [INFO]  raft/raft.go:1018: nomad.raft: updating configuration: command=AddNonvoter server-id=c54b5bf4-1159-34f6-032d-56aefeb08425 server-addr=127.0.0.1:9177 servers="[{Suffrage:Voter ID:df01ba65-d1b2-17a9-f792-a4459b3a7c09 Address:127.0.0.1:9171} {Suffrage:Voter ID:c3337778-811e-2675-87f5-006309888387 Address:127.0.0.1:9173} {Suffrage:Voter ID:186d5e15-c473-e2b3-b5a4-3259a84e10ef Address:127.0.0.1:9169} {Suffrage:Nonvoter ID:c54b5bf4-1159-34f6-032d-56aefeb08425 Address:127.0.0.1:9177}]"

    TestAutopilot_RollingUpdate: autopilot_test.go:218: shutting down server s3
    TestAutopilot_RollingUpdate: testlog.go:34: 2020-04-03T20:08:19.797Z [INFO]  raft/replication.go:456: nomad.raft: aborting pipeline replication: peer="{Nonvoter c54b5bf4-1159-34f6-032d-56aefeb08425 127.0.0.1:9177}"
    TestAutopilot_RollingUpdate: autopilot_test.go:235: waiting for s4 to stabalize and be promoted
    TestAutopilot_RollingUpdate: testlog.go:34: 2020-04-03T20:08:19.975Z [ERROR] raft/raft.go:1656: nomad.raft: failed to make requestVote RPC: target="{Voter c3337778-811e-2675-87f5-006309888387 127.0.0.1:9173}" error="dial tcp 127.0.0.1:9173: connect: connection refused"
    TestAutopilot_RollingUpdate: retry.go:121: autopilot_test.go:241: don't want "c3337778-811e-2675-87f5-006309888387"
        autopilot_test.go:241: didn't find map[c54b5bf4-1159-34f6-032d-56aefeb08425:true] in []raft.ServerID{"df01ba65-d1b2-17a9-f792-a4459b3a7c09", "186d5e15-c473-e2b3-b5a4-3259a84e10ef"}
```

Note how s3 (c3337778) is still present in the peers list in the final
failure, while s4 (c54b5bf4) was added as a Nonvoter and is absent from
the final peers list.
2020-04-03 17:15:41 -04:00


package nomad

import (
	"fmt"
	"testing"
	"time"

	"github.com/hashicorp/consul/agent/consul/autopilot"
	"github.com/hashicorp/consul/sdk/testutil/retry"
	"github.com/hashicorp/nomad/testutil"
	"github.com/hashicorp/raft"
	"github.com/hashicorp/serf/serf"
)
// wantPeers determines whether the server has the given
// number of voting raft peers.
func wantPeers(s *Server, peers int) error {
	future := s.raft.GetConfiguration()
	if err := future.Error(); err != nil {
		return err
	}

	n := autopilot.NumPeers(future.Configuration())
	if got, want := n, peers; got != want {
		return fmt.Errorf("got %d peers want %d", got, want)
	}
	return nil
}
// wantRaft determines if the servers have all of each other in their
// Raft configurations.
func wantRaft(servers []*Server) error {
	// Make sure all the servers are represented in the Raft config,
	// and that there are no extras.
	verifyRaft := func(c raft.Configuration) error {
		want := make(map[raft.ServerID]bool)
		for _, s := range servers {
			want[s.config.RaftConfig.LocalID] = true
		}

		found := make([]raft.ServerID, 0, len(c.Servers))
		for _, s := range c.Servers {
			found = append(found, s.ID)
			if !want[s.ID] {
				return fmt.Errorf("don't want %q", s.ID)
			}
			delete(want, s.ID)
		}
		if len(want) > 0 {
			return fmt.Errorf("didn't find %v in %#+v", want, found)
		}
		return nil
	}

	for _, s := range servers {
		future := s.raft.GetConfiguration()
		if err := future.Error(); err != nil {
			return err
		}
		if err := verifyRaft(future.Configuration()); err != nil {
			return err
		}
	}
	return nil
}
func TestAutopilot_CleanupDeadServer(t *testing.T) {
	t.Parallel()
	for i := 1; i <= 3; i++ {
		testCleanupDeadServer(t, i)
	}
}

func testCleanupDeadServer(t *testing.T, raftVersion int) {
	conf := func(c *Config) {
		c.BootstrapExpect = 3
		c.RaftConfig.ProtocolVersion = raft.ProtocolVersion(raftVersion)
	}

	s1, cleanupS1 := TestServer(t, conf)
	defer cleanupS1()
	s2, cleanupS2 := TestServer(t, conf)
	defer cleanupS2()
	s3, cleanupS3 := TestServer(t, conf)
	defer cleanupS3()
	servers := []*Server{s1, s2, s3}

	// Try to join
	TestJoin(t, s1, s2, s3)
	for _, s := range servers {
		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
	}

	// Bring up a new server
	s4, cleanupS4 := TestServer(t, conf)
	defer cleanupS4()

	// Kill a non-leader server
	s3.Shutdown()
	retry.Run(t, func(r *retry.R) {
		alive := 0
		for _, m := range s1.Members() {
			if m.Status == serf.StatusAlive {
				alive++
			}
		}
		if alive != 2 {
			r.Fatal(nil)
		}
	})

	// Join the new server
	TestJoin(t, s1, s4)
	servers[2] = s4

	// Make sure the dead server is removed and we're back to 3 total peers
	for _, s := range servers {
		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
	}
}
func TestAutopilot_CleanupDeadServerPeriodic(t *testing.T) {
	t.Parallel()

	conf := func(c *Config) {
		c.BootstrapExpect = 5
	}

	s1, cleanupS1 := TestServer(t, conf)
	defer cleanupS1()
	s2, cleanupS2 := TestServer(t, conf)
	defer cleanupS2()
	s3, cleanupS3 := TestServer(t, conf)
	defer cleanupS3()
	s4, cleanupS4 := TestServer(t, conf)
	defer cleanupS4()
	s5, cleanupS5 := TestServer(t, conf)
	defer cleanupS5()

	servers := []*Server{s1, s2, s3, s4, s5}

	// Join the servers to s1, and wait until they are all promoted to
	// voters.
	TestJoin(t, s1, servers[1:]...)
	retry.Run(t, func(r *retry.R) {
		r.Check(wantRaft(servers))
		for _, s := range servers {
			r.Check(wantPeers(s, 5))
		}
	})

	// Kill a non-leader server
	s4.Shutdown()

	// Should be removed from the peers automatically
	servers = []*Server{s1, s2, s3, s5}
	retry.Run(t, func(r *retry.R) {
		r.Check(wantRaft(servers))
		for _, s := range servers {
			r.Check(wantPeers(s, 4))
		}
	})
}
func TestAutopilot_RollingUpdate(t *testing.T) {
	t.Parallel()

	conf := func(c *Config) {
		c.BootstrapExpect = 3
		c.RaftConfig.ProtocolVersion = 3
	}

	s1, cleanupS1 := TestServer(t, conf)
	defer cleanupS1()
	s2, cleanupS2 := TestServer(t, conf)
	defer cleanupS2()
	s3, cleanupS3 := TestServer(t, conf)
	defer cleanupS3()

	// Join the servers to s1, and wait until they are all promoted to
	// voters.
	servers := []*Server{s1, s2, s3}
	TestJoin(t, s1, s2, s3)
	retry.Run(t, func(r *retry.R) {
		r.Check(wantRaft(servers))
		for _, s := range servers {
			r.Check(wantPeers(s, 3))
		}
	})

	// Add one more server like we are doing a rolling update.
	t.Logf("adding server s4")
	s4, cleanupS4 := TestServer(t, conf)
	defer cleanupS4()
	TestJoin(t, s1, s4)

	// Wait until s4 is a voter: wantPeers counts voting peers only, so
	// requiring 4 ensures autopilot has promoted s4 before s3 goes away.
	servers = append(servers, s4)
	retry.Run(t, func(r *retry.R) {
		r.Check(wantRaft(servers))
		for _, s := range servers {
			r.Check(wantPeers(s, 4))
		}
	})

	// Now kill one of the "old" nodes like we are doing a rolling update.
	t.Logf("shutting down server s3")
	s3.Shutdown()

	// isVoter reports whether s4 appears with voter suffrage in the raft
	// configuration as seen by s1.
	isVoter := func() bool {
		future := s1.raft.GetConfiguration()
		if err := future.Error(); err != nil {
			t.Fatalf("err: %v", err)
		}
		for _, s := range future.Configuration().Servers {
			if string(s.ID) == string(s4.config.NodeID) {
				return s.Suffrage == raft.Voter
			}
		}
		t.Fatalf("didn't find s4")
		return false
	}

	t.Logf("waiting for s4 to stabilize and be promoted")

	// Wait for s4 to stabilize, get promoted to a voter, and for s3 to be
	// removed.
	servers = []*Server{s1, s2, s4}
	retry.Run(t, func(r *retry.R) {
		r.Check(wantRaft(servers))
		for _, s := range servers {
			r.Check(wantPeers(s, 3))
		}
		if !isVoter() {
			r.Fatalf("should be a voter")
		}
	})
}
func TestAutopilot_CleanupStaleRaftServer(t *testing.T) {
	t.Skip("TestAutopilot_CleanupStaleRaftServer is very flaky, removing it for now")
	t.Parallel()

	conf := func(c *Config) {
		c.BootstrapExpect = 3
	}

	s1, cleanupS1 := TestServer(t, conf)
	defer cleanupS1()
	s2, cleanupS2 := TestServer(t, conf)
	defer cleanupS2()
	s3, cleanupS3 := TestServer(t, conf)
	defer cleanupS3()
	s4, cleanupS4 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 0
	})
	defer cleanupS4()

	servers := []*Server{s1, s2, s3}

	// Join the servers to s1
	TestJoin(t, s1, s2, s3)
	leader := waitForStableLeadership(t, servers)

	// Add s4 to peers directly
	addr := fmt.Sprintf("127.0.0.1:%d", s4.config.RPCAddr.Port)
	future := leader.raft.AddVoter(raft.ServerID(s4.config.NodeID), raft.ServerAddress(addr), 0, 0)
	if err := future.Error(); err != nil {
		t.Fatal(err)
	}

	// Verify we have 4 peers
	peers, err := s1.numPeers()
	if err != nil {
		t.Fatal(err)
	}
	if peers != 4 {
		t.Fatalf("bad: %v", peers)
	}

	// Wait for s4 to be removed
	for _, s := range []*Server{s1, s2, s3} {
		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
	}
}
func TestAutopilot_PromoteNonVoter(t *testing.T) {
	t.Parallel()

	s1, cleanupS1 := TestServer(t, func(c *Config) {
		c.RaftConfig.ProtocolVersion = 3
	})
	defer cleanupS1()
	codec := rpcClient(t, s1)
	defer codec.Close()
	testutil.WaitForLeader(t, s1.RPC)

	s2, cleanupS2 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 0
		c.RaftConfig.ProtocolVersion = 3
	})
	defer cleanupS2()
	TestJoin(t, s1, s2)

	// Make sure we see it as a nonvoter initially. We wait until half
	// the stabilization period has passed.
	retry.Run(t, func(r *retry.R) {
		future := s1.raft.GetConfiguration()
		if err := future.Error(); err != nil {
			r.Fatal(err)
		}

		servers := future.Configuration().Servers
		if len(servers) != 2 {
			r.Fatalf("bad: %v", servers)
		}
		if servers[1].Suffrage != raft.Nonvoter {
			r.Fatalf("bad: %v", servers)
		}
		health := s1.autopilot.GetServerHealth(string(servers[1].ID))
		if health == nil {
			r.Fatalf("nil health, %v", s1.autopilot.GetClusterHealth())
		}
		if !health.Healthy {
			r.Fatalf("bad: %v", health)
		}
		if time.Since(health.StableSince) < s1.config.AutopilotConfig.ServerStabilizationTime/2 {
			r.Fatal("stable period not elapsed")
		}
	})

	// Make sure it ends up as a voter.
	retry.Run(t, func(r *retry.R) {
		future := s1.raft.GetConfiguration()
		if err := future.Error(); err != nil {
			r.Fatal(err)
		}

		servers := future.Configuration().Servers
		if len(servers) != 2 {
			r.Fatalf("bad: %v", servers)
		}
		if servers[1].Suffrage != raft.Voter {
			r.Fatalf("bad: %v", servers)
		}
	})
}