open-consul/agent/consul/autopilot_test.go
Daniel Nephin f3b63514d5 testing: Remove NotifyShutdown
NotifyShutdown was only used for testing. Now that t.Cleanup exists, we
can use that instead of attaching cleanup to the Server shutdown.

The Autopilot test which used NotifyShutdown doesn't need this
notification because Shutdown is synchronous. Waiting for the function
to return is equivalent.
2020-08-07 17:14:44 -04:00

488 lines
11 KiB
Go

package consul
import (
"os"
"testing"
"time"
"github.com/hashicorp/consul/sdk/testutil/retry"
"github.com/hashicorp/consul/testrpc"
"github.com/hashicorp/raft"
"github.com/hashicorp/serf/serf"
"github.com/stretchr/testify/require"
)
func TestAutopilot_IdempotentShutdown(t *testing.T) {
dir1, s1 := testServerWithConfig(t, nil)
defer os.RemoveAll(dir1)
defer s1.Shutdown()
retry.Run(t, func(r *retry.R) { r.Check(waitForLeader(s1)) })
s1.autopilot.Start()
s1.autopilot.Start()
s1.autopilot.Start()
s1.autopilot.Stop()
s1.autopilot.Stop()
s1.autopilot.Stop()
}
func TestAutopilot_CleanupDeadServer(t *testing.T) {
t.Parallel()
for i := 1; i <= 3; i++ {
testCleanupDeadServer(t, i)
}
}
func testCleanupDeadServer(t *testing.T, raftVersion int) {
dc := "dc1"
conf := func(c *Config) {
c.Datacenter = dc
c.Bootstrap = false
c.BootstrapExpect = 5
c.RaftConfig.ProtocolVersion = raft.ProtocolVersion(raftVersion)
}
dir1, s1 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir1)
defer s1.Shutdown()
dir2, s2 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir2)
defer s2.Shutdown()
dir3, s3 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir3)
defer s3.Shutdown()
dir4, s4 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir4)
defer s4.Shutdown()
dir5, s5 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir5)
defer s5.Shutdown()
servers := []*Server{s1, s2, s3, s4, s5}
// Try to join
joinLAN(t, s2, s1)
joinLAN(t, s3, s1)
joinLAN(t, s4, s1)
joinLAN(t, s5, s1)
for _, s := range servers {
testrpc.WaitForLeader(t, s.RPC, dc)
retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 5)) })
}
require := require.New(t)
testrpc.WaitForLeader(t, s1.RPC, "dc1")
leaderIndex := -1
for i, s := range servers {
if s.IsLeader() {
leaderIndex = i
break
}
}
require.NotEqual(leaderIndex, -1)
// Shutdown two non-leader servers
killed := make(map[string]struct{})
for i, s := range servers {
if i != leaderIndex {
s.Shutdown()
killed[string(s.config.NodeID)] = struct{}{}
}
if len(killed) == 2 {
break
}
}
retry.Run(t, func(r *retry.R) {
alive := 0
for _, m := range servers[leaderIndex].LANMembers() {
if m.Status == serf.StatusAlive {
alive++
}
}
if alive != 3 {
r.Fatalf("Expected three alive servers instead of %d", alive)
}
})
// Make sure the dead servers are removed and we're back to 3 total peers
for _, s := range servers {
_, killed := killed[string(s.config.NodeID)]
if !killed {
retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
}
}
}
func TestAutopilot_CleanupDeadNonvoter(t *testing.T) {
dir1, s1 := testServer(t)
defer os.RemoveAll(dir1)
defer s1.Shutdown()
dir2, s2 := testServerDCBootstrap(t, "dc1", false)
defer os.RemoveAll(dir2)
defer s2.Shutdown()
testrpc.WaitForLeader(t, s1.RPC, "dc1")
// Have s2 join and then shut it down immediately before it gets a chance to
// be promoted to a voter.
joinLAN(t, s2, s1)
retry.Run(t, func(r *retry.R) {
r.Check(wantRaft([]*Server{s1, s2}))
})
s2.Shutdown()
retry.Run(t, func(r *retry.R) {
r.Check(wantRaft([]*Server{s1}))
})
}
func TestAutopilot_CleanupDeadServerPeriodic(t *testing.T) {
t.Parallel()
dir1, s1 := testServerWithConfig(t, func(c *Config) {
c.Datacenter = "dc1"
c.Bootstrap = true
})
defer os.RemoveAll(dir1)
defer s1.Shutdown()
conf := func(c *Config) {
c.Datacenter = "dc1"
c.Bootstrap = false
}
dir2, s2 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir2)
defer s2.Shutdown()
dir3, s3 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir3)
defer s3.Shutdown()
dir4, s4 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir4)
defer s4.Shutdown()
dir5, s5 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir5)
defer s5.Shutdown()
servers := []*Server{s1, s2, s3, s4, s5}
// Join the servers to s1, and wait until they are all promoted to
// voters.
for _, s := range servers[1:] {
joinLAN(t, s, s1)
}
retry.Run(t, func(r *retry.R) {
r.Check(wantRaft(servers))
for _, s := range servers {
r.Check(wantPeers(s, 5))
}
})
// Kill a non-leader server
s4.Shutdown()
// Should be removed from the peers automatically
servers = []*Server{s1, s2, s3, s5}
retry.Run(t, func(r *retry.R) {
r.Check(wantRaft(servers))
for _, s := range servers {
r.Check(wantPeers(s, 4))
}
})
}
func TestAutopilot_RollingUpdate(t *testing.T) {
t.Parallel()
dir1, s1 := testServerWithConfig(t, func(c *Config) {
c.Datacenter = "dc1"
c.Bootstrap = true
})
defer os.RemoveAll(dir1)
defer s1.Shutdown()
conf := func(c *Config) {
c.Datacenter = "dc1"
c.Bootstrap = false
}
dir2, s2 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir2)
defer s2.Shutdown()
dir3, s3 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir3)
defer s3.Shutdown()
// Join the servers to s1, and wait until they are all promoted to
// voters.
servers := []*Server{s1, s2, s3}
for _, s := range servers[1:] {
joinLAN(t, s, s1)
}
retry.Run(t, func(r *retry.R) {
r.Check(wantRaft(servers))
for _, s := range servers {
r.Check(wantPeers(s, 3))
}
})
// Add one more server like we are doing a rolling update.
dir4, s4 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir4)
defer s4.Shutdown()
joinLAN(t, s1, s4)
servers = append(servers, s4)
retry.Run(t, func(r *retry.R) {
r.Check(wantRaft(servers))
for _, s := range servers {
r.Check(wantPeers(s, 3))
}
})
// Now kill one of the "old" nodes like we are doing a rolling update.
s3.Shutdown()
isVoter := func() bool {
future := s1.raft.GetConfiguration()
if err := future.Error(); err != nil {
t.Fatalf("err: %v", err)
}
for _, s := range future.Configuration().Servers {
if string(s.ID) == string(s4.config.NodeID) {
return s.Suffrage == raft.Voter
}
}
t.Fatalf("didn't find s4")
return false
}
// Wait for s4 to stabilize, get promoted to a voter, and for s3 to be
// removed.
servers = []*Server{s1, s2, s4}
retry.Run(t, func(r *retry.R) {
r.Check(wantRaft(servers))
for _, s := range servers {
r.Check(wantPeers(s, 3))
}
if !isVoter() {
r.Fatalf("should be a voter")
}
})
}
func TestAutopilot_CleanupStaleRaftServer(t *testing.T) {
t.Parallel()
dir1, s1 := testServerDCBootstrap(t, "dc1", true)
defer os.RemoveAll(dir1)
defer s1.Shutdown()
dir2, s2 := testServerDCBootstrap(t, "dc1", false)
defer os.RemoveAll(dir2)
defer s2.Shutdown()
dir3, s3 := testServerDCBootstrap(t, "dc1", false)
defer os.RemoveAll(dir3)
defer s3.Shutdown()
dir4, s4 := testServerDCBootstrap(t, "dc1", false)
defer os.RemoveAll(dir4)
defer s4.Shutdown()
servers := []*Server{s1, s2, s3}
// Join the servers to s1
for _, s := range servers[1:] {
joinLAN(t, s, s1)
}
for _, s := range servers {
retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
}
testrpc.WaitForLeader(t, s1.RPC, "dc1")
// Add s4 to peers directly
addVoterFuture := s1.raft.AddVoter(raft.ServerID(s4.config.NodeID), raft.ServerAddress(joinAddrLAN(s4)), 0, 0)
if err := addVoterFuture.Error(); err != nil {
t.Fatal(err)
}
// Verify we have 4 peers
peers, err := s1.numPeers()
if err != nil {
t.Fatal(err)
}
if peers != 4 {
t.Fatalf("bad: %v", peers)
}
// Wait for s4 to be removed
for _, s := range []*Server{s1, s2, s3} {
retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
}
}
func TestAutopilot_PromoteNonVoter(t *testing.T) {
t.Parallel()
dir1, s1 := testServerWithConfig(t, func(c *Config) {
c.Datacenter = "dc1"
c.Bootstrap = true
c.RaftConfig.ProtocolVersion = 3
c.AutopilotConfig.ServerStabilizationTime = 200 * time.Millisecond
c.ServerHealthInterval = 100 * time.Millisecond
c.AutopilotInterval = 100 * time.Millisecond
})
defer os.RemoveAll(dir1)
defer s1.Shutdown()
codec := rpcClient(t, s1)
defer codec.Close()
testrpc.WaitForLeader(t, s1.RPC, "dc1")
dir2, s2 := testServerWithConfig(t, func(c *Config) {
c.Datacenter = "dc1"
c.Bootstrap = false
c.RaftConfig.ProtocolVersion = 3
})
defer os.RemoveAll(dir2)
defer s2.Shutdown()
joinLAN(t, s2, s1)
// Make sure we see it as a nonvoter initially. We wait until half
// the stabilization period has passed.
retry.Run(t, func(r *retry.R) {
future := s1.raft.GetConfiguration()
if err := future.Error(); err != nil {
r.Fatal(err)
}
servers := future.Configuration().Servers
if len(servers) != 2 {
r.Fatalf("bad: %v", servers)
}
if servers[1].Suffrage != raft.Nonvoter {
r.Fatalf("bad: %v", servers)
}
health := s1.autopilot.GetServerHealth(string(servers[1].ID))
if health == nil {
r.Fatal("nil health")
}
if !health.Healthy {
r.Fatalf("bad: %v", health)
}
if time.Since(health.StableSince) < s1.config.AutopilotConfig.ServerStabilizationTime/2 {
r.Fatal("stable period not elapsed")
}
})
// Make sure it ends up as a voter.
retry.Run(t, func(r *retry.R) {
future := s1.raft.GetConfiguration()
if err := future.Error(); err != nil {
r.Fatal(err)
}
servers := future.Configuration().Servers
if len(servers) != 2 {
r.Fatalf("bad: %v", servers)
}
if servers[1].Suffrage != raft.Voter {
r.Fatalf("bad: %v", servers)
}
})
}
func TestAutopilot_MinQuorum(t *testing.T) {
dc := "dc1"
conf := func(c *Config) {
c.Datacenter = dc
c.Bootstrap = false
c.BootstrapExpect = 4
c.AutopilotConfig.MinQuorum = 3
c.RaftConfig.ProtocolVersion = raft.ProtocolVersion(2)
c.AutopilotInterval = 100 * time.Millisecond
}
dir1, s1 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir1)
defer s1.Shutdown()
dir2, s2 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir2)
defer s2.Shutdown()
dir3, s3 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir3)
defer s3.Shutdown()
dir4, s4 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir4)
defer s4.Shutdown()
servers := map[string]*Server{s1.config.NodeName: s1,
s2.config.NodeName: s2,
s3.config.NodeName: s3,
s4.config.NodeName: s4}
// Try to join
joinLAN(t, s2, s1)
joinLAN(t, s3, s1)
joinLAN(t, s4, s1)
//Differentiate between leader and server
findStatus := func(leader bool) *Server {
for _, mem := range servers {
if mem.IsLeader() == leader {
return mem
}
if !mem.IsLeader() == !leader {
return mem
}
}
return nil
}
testrpc.WaitForLeader(t, s1.RPC, dc)
// Have autopilot take one into left
dead := findStatus(false)
if dead == nil {
t.Fatalf("no members set")
}
require.NoError(t, dead.Shutdown())
retry.Run(t, func(r *retry.R) {
leader := findStatus(true)
if leader == nil {
r.Fatalf("no members set")
}
for _, m := range leader.LANMembers() {
if m.Name == dead.config.NodeName && m.Status != serf.StatusLeft {
r.Fatalf("%v should be left, got %v", m.Name, m.Status.String())
}
}
})
delete(servers, dead.config.NodeName)
//Autopilot should not take this one into left
dead = findStatus(false)
require.NoError(t, dead.Shutdown())
retry.Run(t, func(r *retry.R) {
leader := findStatus(true)
if leader == nil {
r.Fatalf("no members set")
}
for _, m := range leader.LANMembers() {
if m.Name == dead.config.NodeName && m.Status != serf.StatusFailed {
r.Fatalf("%v should be failed, got %v", m.Name, m.Status.String())
}
}
})
}