open-nomad/nomad/autopilot_test.go
Michael Schurter c1bd10456c test: fix flaky TestAutopilot_CleanupDeadServer
The fix seems to be related to the pointer comparison and swapping we
did around killing a non-leader. I actually can't quite explain it, but
when comparing against Consul's version of this test I noticed they used
the slice index to track the killed server instead of pointer swapping.

As soon as I switched to slice index tracking I could no longer
reproduce the failure.

In addition:
- Tested membership counts on all servers instead of just 1 for added
  correctness.
- Stopped testing raft v1 because it is unsupported.
2021-09-28 16:38:56 -07:00
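
As a reference for the fix described above, here is a minimal sketch contrasting the two patterns. The swapping style is still visible in TestAutopilot_CleanupDeadServerPeriodic below and only approximates the removed code; the index-tracking style is the one testCleanupDeadServer now uses:

// Swapping style (approximate): ensure the server about to be killed sits
// in a known variable by swapping values, then compare by pointer later.
if leader := waitForStableLeadership(t, servers); leader == s4 {
	s1, s4 = s4, s1
}
s4.Shutdown()

// Index-tracking style: remember only the slice position of the killed
// server, skip that index in later checks, and reuse the slot for the
// replacement server.
killedIdx := 0
for i, s := range servers {
	if !s.IsLeader() {
		killedIdx = i
		s.Shutdown()
		break
	}
}
servers[killedIdx] = s4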


package nomad

import (
	"fmt"
	"testing"
	"time"

	"github.com/hashicorp/consul/agent/consul/autopilot"
	"github.com/hashicorp/consul/sdk/testutil/retry"
	"github.com/hashicorp/nomad/testutil"
	"github.com/hashicorp/raft"
	"github.com/hashicorp/serf/serf"
)

// wantPeers determines whether the server has the given
// number of voting raft peers.
func wantPeers(s *Server, peers int) error {
	future := s.raft.GetConfiguration()
	if err := future.Error(); err != nil {
		return err
	}
	n := autopilot.NumPeers(future.Configuration())
	if got, want := n, peers; got != want {
		return fmt.Errorf("server %v: got %d peers want %d\n\tservers: %#+v", s.config.NodeName, got, want, future.Configuration().Servers)
	}
	return nil
}

// wantRaft determines if the servers have all of each other in their
// Raft configurations.
func wantRaft(servers []*Server) error {
	// Make sure all the servers are represented in the Raft config,
	// and that there are no extras.
	verifyRaft := func(c raft.Configuration) error {
		want := make(map[raft.ServerID]bool)
		for _, s := range servers {
			want[s.config.RaftConfig.LocalID] = true
		}

		found := make([]raft.ServerID, 0, len(c.Servers))
		for _, s := range c.Servers {
			found = append(found, s.ID)
			if !want[s.ID] {
				return fmt.Errorf("don't want %q", s.ID)
			}
			delete(want, s.ID)
		}
		if len(want) > 0 {
			return fmt.Errorf("didn't find %v in %#+v", want, found)
		}
		return nil
	}

	for _, s := range servers {
		future := s.raft.GetConfiguration()
		if err := future.Error(); err != nil {
			return err
		}
		if err := verifyRaft(future.Configuration()); err != nil {
			return err
		}
	}
	return nil
}

func TestAutopilot_CleanupDeadServer(t *testing.T) {
	t.Parallel()
	t.Run("raft_v2", func(t *testing.T) { testCleanupDeadServer(t, 2) })
	t.Run("raft_v3", func(t *testing.T) { testCleanupDeadServer(t, 3) })
}

func testCleanupDeadServer(t *testing.T, raftVersion int) {
	conf := func(c *Config) {
		c.BootstrapExpect = 3
		c.RaftConfig.ProtocolVersion = raft.ProtocolVersion(raftVersion)
	}

	s1, cleanupS1 := TestServer(t, conf)
	defer cleanupS1()

	s2, cleanupS2 := TestServer(t, conf)
	defer cleanupS2()

	s3, cleanupS3 := TestServer(t, conf)
	defer cleanupS3()

	servers := []*Server{s1, s2, s3}

	// Try to join
	TestJoin(t, servers...)

	for _, s := range servers {
		testutil.WaitForLeader(t, s.RPC)
		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
	}

	// Bring up a new server
	s4, cleanupS4 := TestServer(t, conf)
	defer cleanupS4()

	// Kill a non-leader server
	killedIdx := 0
	for i, s := range servers {
		if !s.IsLeader() {
			killedIdx = i
			s.Shutdown()
			break
		}
	}

	retry.Run(t, func(r *retry.R) {
		for i, s := range servers {
			alive := 0
			if i == killedIdx {
				// Skip shutdown server
				continue
			}

			for _, m := range s.Members() {
				if m.Status == serf.StatusAlive {
					alive++
				}
			}

			if alive != 2 {
				r.Fatalf("expected 2 alive servers but found %v", alive)
			}
		}
	})

	// Join the new server
	servers[killedIdx] = s4
	TestJoin(t, servers...)

	waitForStableLeadership(t, servers)

	// Make sure the dead server is removed and we're back to 3 total peers
	for _, s := range servers {
		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
	}
}

func TestAutopilot_CleanupDeadServerPeriodic(t *testing.T) {
	t.Parallel()

	conf := func(c *Config) {
		c.BootstrapExpect = 5
	}

	s1, cleanupS1 := TestServer(t, conf)
	defer cleanupS1()

	s2, cleanupS2 := TestServer(t, conf)
	defer cleanupS2()

	s3, cleanupS3 := TestServer(t, conf)
	defer cleanupS3()

	s4, cleanupS4 := TestServer(t, conf)
	defer cleanupS4()

	s5, cleanupS5 := TestServer(t, conf)
	defer cleanupS5()

	servers := []*Server{s1, s2, s3, s4, s5}

	// Join the servers to s1, and wait until they are all promoted to
	// voters.
	TestJoin(t, servers...)
	retry.Run(t, func(r *retry.R) {
		r.Check(wantRaft(servers))
		for _, s := range servers {
			r.Check(wantPeers(s, 5))
		}
	})

	// Kill a non-leader server
	if leader := waitForStableLeadership(t, servers); leader == s4 {
		s1, s4 = s4, s1
	}
	s4.Shutdown()

	// Should be removed from the peers automatically
	servers = []*Server{s1, s2, s3, s5}
	retry.Run(t, func(r *retry.R) {
		r.Check(wantRaft(servers))
		for _, s := range servers {
			r.Check(wantPeers(s, 4))
		}
	})
}

func TestAutopilot_RollingUpdate(t *testing.T) {
	t.Parallel()

	conf := func(c *Config) {
		c.BootstrapExpect = 3
		c.RaftConfig.ProtocolVersion = 3
	}

	s1, cleanupS1 := TestServer(t, conf)
	defer cleanupS1()

	s2, cleanupS2 := TestServer(t, conf)
	defer cleanupS2()

	s3, cleanupS3 := TestServer(t, conf)
	defer cleanupS3()

	// Join the servers to s1, and wait until they are all promoted to
	// voters.
	servers := []*Server{s1, s2, s3}
	TestJoin(t, s1, s2, s3)
	retry.Run(t, func(r *retry.R) {
		r.Check(wantRaft(servers))
		for _, s := range servers {
			r.Check(wantPeers(s, 3))
		}
	})

	// Add one more server like we are doing a rolling update.
	t.Logf("adding server s4")
	s4, cleanupS4 := TestServer(t, conf)
	defer cleanupS4()

	TestJoin(t, s1, s4)
	servers = append(servers, s4)
	retry.Run(t, func(r *retry.R) {
		r.Check(wantRaft(servers))
		for _, s := range servers {
			r.Check(wantPeers(s, 4))
		}
	})

	// Now kill one of the "old" nodes like we are doing a rolling update.
	t.Logf("shutting down server s3")
	s3.Shutdown()

	isVoter := func() bool {
		future := s1.raft.GetConfiguration()
		if err := future.Error(); err != nil {
			t.Fatalf("err: %v", err)
		}
		for _, s := range future.Configuration().Servers {
			if string(s.ID) == string(s4.config.NodeID) {
				return s.Suffrage == raft.Voter
			}
		}
		t.Fatalf("didn't find s4")
		return false
	}

	t.Logf("waiting for s4 to stabilize and be promoted")

	// Wait for s4 to stabilize, get promoted to a voter, and for s3 to be
	// removed.
	servers = []*Server{s1, s2, s4}
	retry.Run(t, func(r *retry.R) {
		r.Check(wantRaft(servers))
		for _, s := range servers {
			r.Check(wantPeers(s, 3))
		}
		if !isVoter() {
			r.Fatalf("should be a voter")
		}
	})
}

func TestAutopilot_CleanupStaleRaftServer(t *testing.T) {
	t.Skip("TestAutopilot_CleanupStaleRaftServer is very flaky, removing it for now")
	t.Parallel()

	conf := func(c *Config) {
		c.BootstrapExpect = 3
	}

	s1, cleanupS1 := TestServer(t, conf)
	defer cleanupS1()

	s2, cleanupS2 := TestServer(t, conf)
	defer cleanupS2()

	s3, cleanupS3 := TestServer(t, conf)
	defer cleanupS3()

	s4, cleanupS4 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 0
	})
	defer cleanupS4()

	servers := []*Server{s1, s2, s3}

	// Join the servers to s1
	TestJoin(t, s1, s2, s3)
	leader := waitForStableLeadership(t, servers)

	// Add s4 to peers directly
	addr := fmt.Sprintf("127.0.0.1:%d", s4.config.RPCAddr.Port)
	future := leader.raft.AddVoter(raft.ServerID(s4.config.NodeID), raft.ServerAddress(addr), 0, 0)
	if err := future.Error(); err != nil {
		t.Fatal(err)
	}

	// Verify we have 4 peers
	peers, err := s1.numPeers()
	if err != nil {
		t.Fatal(err)
	}
	if peers != 4 {
		t.Fatalf("bad: %v", peers)
	}

	// Wait for s4 to be removed
	for _, s := range []*Server{s1, s2, s3} {
		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
	}
}

func TestAutopilot_PromoteNonVoter(t *testing.T) {
	t.Parallel()

	s1, cleanupS1 := TestServer(t, func(c *Config) {
		c.RaftConfig.ProtocolVersion = 3
	})
	defer cleanupS1()
	codec := rpcClient(t, s1)
	defer codec.Close()
	testutil.WaitForLeader(t, s1.RPC)

	s2, cleanupS2 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 0
		c.RaftConfig.ProtocolVersion = 3
	})
	defer cleanupS2()
	TestJoin(t, s1, s2)

	// Make sure we see it as a nonvoter initially. We wait until half
	// the stabilization period has passed.
	retry.Run(t, func(r *retry.R) {
		future := s1.raft.GetConfiguration()
		if err := future.Error(); err != nil {
			r.Fatal(err)
		}

		servers := future.Configuration().Servers
		if len(servers) != 2 {
			r.Fatalf("bad: %v", servers)
		}
		if servers[1].Suffrage != raft.Nonvoter {
			r.Fatalf("bad: %v", servers)
		}
		health := s1.autopilot.GetServerHealth(string(servers[1].ID))
		if health == nil {
			r.Fatalf("nil health, %v", s1.autopilot.GetClusterHealth())
		}
		if !health.Healthy {
			r.Fatalf("bad: %v", health)
		}
		if time.Since(health.StableSince) < s1.config.AutopilotConfig.ServerStabilizationTime/2 {
			r.Fatal("stable period not elapsed")
		}
	})

	// Make sure it ends up as a voter.
	retry.Run(t, func(r *retry.R) {
		future := s1.raft.GetConfiguration()
		if err := future.Error(); err != nil {
			r.Fatal(err)
		}

		servers := future.Configuration().Servers
		if len(servers) != 2 {
			r.Fatalf("bad: %v", servers)
		}
		if servers[1].Suffrage != raft.Voter {
			r.Fatalf("bad: %v", servers)
		}
	})
}