package consul

import (
	"context"
	"fmt"
	"os"
	"testing"
	"time"

	"github.com/hashicorp/raft"
	"github.com/hashicorp/serf/serf"
	"github.com/stretchr/testify/require"

	"github.com/hashicorp/consul/agent/consul/autopilotevents"
	"github.com/hashicorp/consul/agent/consul/stream"
	"github.com/hashicorp/consul/agent/structs"
	"github.com/hashicorp/consul/sdk/testutil/retry"
	"github.com/hashicorp/consul/testrpc"
)

func TestAutopilot_IdempotentShutdown(t *testing.T) {
	if testing.Short() {
		t.Skip("too slow for testing.Short")
	}

	dir1, s1 := testServerWithConfig(t)
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()
	retry.Run(t, func(r *retry.R) { r.Check(waitForLeader(s1)) })

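	// Repeated Start and Stop calls should be safe: each Stop returns a channel
	// that unblocks once autopilot has finished shutting down.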
	s1.autopilot.Start(context.Background())
	s1.autopilot.Start(context.Background())
	s1.autopilot.Start(context.Background())
	<-s1.autopilot.Stop()
	<-s1.autopilot.Stop()
	<-s1.autopilot.Stop()
}

func TestAutopilot_CleanupDeadServer(t *testing.T) {
	if testing.Short() {
		t.Skip("too slow for testing.Short")
	}

	dc := "dc1"
	conf := func(c *Config) {
		c.Datacenter = dc
		c.Bootstrap = false
		c.BootstrapExpect = 5
	}
	dir1, s1 := testServerWithConfig(t, conf)
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()

	dir2, s2 := testServerWithConfig(t, conf)
	defer os.RemoveAll(dir2)
	defer s2.Shutdown()

	dir3, s3 := testServerWithConfig(t, conf)
	defer os.RemoveAll(dir3)
	defer s3.Shutdown()

	dir4, s4 := testServerWithConfig(t, conf)
	defer os.RemoveAll(dir4)
	defer s4.Shutdown()

	dir5, s5 := testServerWithConfig(t, conf)
	defer os.RemoveAll(dir5)
	defer s5.Shutdown()

	servers := []*Server{s1, s2, s3, s4, s5}

	// Try to join
	joinLAN(t, s2, s1)
	joinLAN(t, s3, s1)
	joinLAN(t, s4, s1)
	joinLAN(t, s5, s1)

	for _, s := range servers {
		testrpc.WaitForLeader(t, s.RPC, dc)
		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 5)) })
	}

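	// Find which server is currently the leader so that only followers are
	// shut down below.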
	testrpc.WaitForLeader(t, s1.RPC, "dc1")
	leaderIndex := -1
	for i, s := range servers {
		if s.IsLeader() {
			leaderIndex = i
			break
		}
	}
	require.NotEqual(t, leaderIndex, -1)

	// Shutdown two non-leader servers
	killed := make(map[string]struct{})
	for i, s := range servers {
		if i != leaderIndex {
			s.Shutdown()
			killed[string(s.config.NodeID)] = struct{}{}
		}
		if len(killed) == 2 {
			break
		}
	}

	retry.Run(t, func(r *retry.R) {
		alive := 0
		for _, m := range servers[leaderIndex].LANMembersInAgentPartition() {
			if m.Status == serf.StatusAlive {
				alive++
			}
		}
		if alive != 3 {
			r.Fatalf("Expected three alive servers instead of %d", alive)
		}
	})

	// Make sure the dead servers are removed and we're back to 3 total peers
	for _, s := range servers {
		_, killed := killed[string(s.config.NodeID)]
		if !killed {
			retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
		}
	}
}

func TestAutopilot_CleanupDeadNonvoter(t *testing.T) {
	if testing.Short() {
		t.Skip("too slow for testing.Short")
	}

	dir1, s1 := testServerWithConfig(t, func(c *Config) {
		c.AutopilotConfig = &structs.AutopilotConfig{
			CleanupDeadServers:      true,
			ServerStabilizationTime: 100 * time.Millisecond,
		}
	})
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()

	// we have to wait for autopilot to be running long enough for the server stabilization time
	// to kick in for this test to work.
	time.Sleep(100 * time.Millisecond)

	dir2, s2 := testServerDCBootstrap(t, "dc1", false)
	defer os.RemoveAll(dir2)
	defer s2.Shutdown()

	testrpc.WaitForLeader(t, s1.RPC, "dc1")

	// Have s2 join and then shut it down immediately before it gets a chance to
	// be promoted to a voter.
	joinLAN(t, s2, s1)
	retry.Run(t, func(r *retry.R) {
		r.Check(wantRaft([]*Server{s1, s2}))
	})
	s2.Shutdown()

	retry.Run(t, func(r *retry.R) {
		r.Check(wantRaft([]*Server{s1}))
	})
}

func TestAutopilot_CleanupDeadServerPeriodic(t *testing.T) {
	if testing.Short() {
		t.Skip("too slow for testing.Short")
	}

	t.Parallel()
	dir1, s1 := testServerWithConfig(t, func(c *Config) {
		c.Datacenter = "dc1"
		c.Bootstrap = true
	})
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()

	conf := func(c *Config) {
		c.Datacenter = "dc1"
		c.Bootstrap = false
	}

	dir2, s2 := testServerWithConfig(t, conf)
	defer os.RemoveAll(dir2)
	defer s2.Shutdown()

	dir3, s3 := testServerWithConfig(t, conf)
	defer os.RemoveAll(dir3)
	defer s3.Shutdown()

	dir4, s4 := testServerWithConfig(t, conf)
	defer os.RemoveAll(dir4)
	defer s4.Shutdown()

	dir5, s5 := testServerWithConfig(t, conf)
	defer os.RemoveAll(dir5)
	defer s5.Shutdown()

	servers := []*Server{s1, s2, s3, s4, s5}

	// Join the servers to s1, and wait until they are all promoted to
	// voters.
	for _, s := range servers[1:] {
		joinLAN(t, s, s1)
	}
	retry.Run(t, func(r *retry.R) {
		r.Check(wantRaft(servers))
		for _, s := range servers {
			r.Check(wantPeers(s, 5))
		}
	})

	// Kill a non-leader server
	s4.Shutdown()

	// Should be removed from the peers automatically
	servers = []*Server{s1, s2, s3, s5}
	retry.Run(t, func(r *retry.R) {
		r.Check(wantRaft(servers))
		for _, s := range servers {
			r.Check(wantPeers(s, 4))
		}
	})
}

func TestAutopilot_RollingUpdate(t *testing.T) {
	if testing.Short() {
		t.Skip("too slow for testing.Short")
	}

	t.Parallel()
	dir1, s1 := testServerWithConfig(t, func(c *Config) {
		c.Datacenter = "dc1"
		c.Bootstrap = true
	})
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()

	conf := func(c *Config) {
		c.Datacenter = "dc1"
		c.Bootstrap = false
	}

	dir2, s2 := testServerWithConfig(t, conf)
	defer os.RemoveAll(dir2)
	defer s2.Shutdown()

	dir3, s3 := testServerWithConfig(t, conf)
	defer os.RemoveAll(dir3)
	defer s3.Shutdown()

	// Join the servers to s1, and wait until they are all promoted to
	// voters.
	servers := []*Server{s1, s2, s3}
	for _, s := range servers[1:] {
		joinLAN(t, s, s1)
	}
	retry.Run(t, func(r *retry.R) {
		r.Check(wantRaft(servers))
		for _, s := range servers {
			r.Check(wantPeers(s, 3))
		}
	})

	// Add one more server like we are doing a rolling update.
	dir4, s4 := testServerWithConfig(t, conf)
	defer os.RemoveAll(dir4)
	defer s4.Shutdown()
	joinLAN(t, s1, s4)
	servers = append(servers, s4)
	retry.Run(t, func(r *retry.R) {
		r.Check(wantRaft(servers))
		for _, s := range servers {
			r.Check(wantPeers(s, 3))
		}
	})

	// Now kill one of the "old" nodes like we are doing a rolling update.
	s3.Shutdown()

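	// isVoter reports whether s4 appears as a raft voter in the leader's
	// current configuration.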
	isVoter := func() bool {
		future := s1.raft.GetConfiguration()
		if err := future.Error(); err != nil {
			t.Fatalf("err: %v", err)
		}
		for _, s := range future.Configuration().Servers {
			if string(s.ID) == string(s4.config.NodeID) {
				return s.Suffrage == raft.Voter
			}
		}
		t.Fatalf("didn't find s4")
		return false
	}

	// Wait for s4 to stabilize, get promoted to a voter, and for s3 to be
	// removed.
	servers = []*Server{s1, s2, s4}
	retry.Run(t, func(r *retry.R) {
		r.Check(wantRaft(servers))
		for _, s := range servers {
			r.Check(wantPeers(s, 3))
		}
		if !isVoter() {
			r.Fatalf("should be a voter")
		}
	})
}

func TestAutopilot_CleanupStaleRaftServer(t *testing.T) {
	if testing.Short() {
		t.Skip("too slow for testing.Short")
	}

	t.Parallel()
	dir1, s1 := testServerDCBootstrap(t, "dc1", true)
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()

	dir2, s2 := testServerDCBootstrap(t, "dc1", false)
	defer os.RemoveAll(dir2)
	defer s2.Shutdown()

	dir3, s3 := testServerDCBootstrap(t, "dc1", false)
	defer os.RemoveAll(dir3)
	defer s3.Shutdown()

	dir4, s4 := testServerDCBootstrap(t, "dc1", false)
	defer os.RemoveAll(dir4)
	defer s4.Shutdown()

	servers := []*Server{s1, s2, s3}

	// Join the servers to s1
	for _, s := range servers[1:] {
		joinLAN(t, s, s1)
	}

	for _, s := range servers {
		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
	}

	testrpc.WaitForLeader(t, s1.RPC, "dc1")

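	// s4 is never joined to the Serf LAN pool, so once it shows up in the raft
	// configuration below it is a "stale" raft server that autopilot should remove.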
	// Add s4 to peers directly
	addVoterFuture := s1.raft.AddVoter(raft.ServerID(s4.config.NodeID), raft.ServerAddress(joinAddrLAN(s4)), 0, 0)
	if err := addVoterFuture.Error(); err != nil {
		t.Fatal(err)
	}

	// Verify we have 4 peers
	peers, err := s1.autopilot.NumVoters()
	if err != nil {
		t.Fatal(err)
	}
	if peers != 4 {
		t.Fatalf("bad: %v", peers)
	}

	// Wait for s4 to be removed
	for _, s := range []*Server{s1, s2, s3} {
		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
	}
}

func TestAutopilot_PromoteNonVoter(t *testing.T) {
	if testing.Short() {
		t.Skip("too slow for testing.Short")
	}

	t.Parallel()
	dir1, s1 := testServerWithConfig(t, func(c *Config) {
		c.Datacenter = "dc1"
		c.Bootstrap = true
		c.AutopilotConfig.ServerStabilizationTime = 200 * time.Millisecond
		c.ServerHealthInterval = 100 * time.Millisecond
		c.AutopilotInterval = 100 * time.Millisecond
	})
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()
	codec := rpcClient(t, s1)
	defer codec.Close()
	testrpc.WaitForLeader(t, s1.RPC, "dc1")

	// this may seem arbitrary but we need to get past the server stabilization time
	// so that we start factoring in that time for newly connected nodes.
	time.Sleep(100 * time.Millisecond)

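	// s2 runs Raft protocol version 3, so it should join as a nonvoter and only
	// be promoted to a voter by autopilot once it has been healthy for the
	// stabilization period.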
	dir2, s2 := testServerWithConfig(t, func(c *Config) {
		c.Datacenter = "dc1"
		c.Bootstrap = false
		c.RaftConfig.ProtocolVersion = 3
	})
	defer os.RemoveAll(dir2)
	defer s2.Shutdown()
	joinLAN(t, s2, s1)

	// Make sure we see it as a nonvoter initially. We wait until half
	// the stabilization period has passed.
	retry.Run(t, func(r *retry.R) {
		future := s1.raft.GetConfiguration()
		if err := future.Error(); err != nil {
			r.Fatal(err)
		}

		servers := future.Configuration().Servers
		if len(servers) != 2 {
			r.Fatalf("bad: %v", servers)
		}
		if servers[1].Suffrage != raft.Nonvoter {
			r.Fatalf("bad: %v", servers)
		}
		health := s1.autopilot.GetServerHealth(servers[1].ID)
		if health == nil {
			r.Fatal("nil health")
		}
		if !health.Healthy {
			r.Fatalf("bad: %v", health)
		}
		if time.Since(health.StableSince) < s1.config.AutopilotConfig.ServerStabilizationTime/2 {
			r.Fatal("stable period not elapsed")
		}
	})

	// Make sure it ends up as a voter.
	retry.Run(t, func(r *retry.R) {
		future := s1.raft.GetConfiguration()
		if err := future.Error(); err != nil {
			r.Fatal(err)
		}

		servers := future.Configuration().Servers
		if len(servers) != 2 {
			r.Fatalf("bad: %v", servers)
		}
		if servers[1].Suffrage != raft.Voter {
			r.Fatalf("bad: %v", servers)
		}
	})
}

func TestAutopilot_MinQuorum(t *testing.T) {
	if testing.Short() {
		t.Skip("too slow for testing.Short")
	}

	dc := "dc1"
	conf := func(c *Config) {
		c.Datacenter = dc
		c.Bootstrap = false
		c.BootstrapExpect = 4
		c.AutopilotConfig.MinQuorum = 3
		c.AutopilotInterval = 100 * time.Millisecond
	}
	dir1, s1 := testServerWithConfig(t, conf)
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()

	dir2, s2 := testServerWithConfig(t, conf)
	defer os.RemoveAll(dir2)
	defer s2.Shutdown()

	dir3, s3 := testServerWithConfig(t, conf)
	defer os.RemoveAll(dir3)
	defer s3.Shutdown()

	dir4, s4 := testServerWithConfig(t, conf)
	defer os.RemoveAll(dir4)
	defer s4.Shutdown()

	servers := map[string]*Server{s1.config.NodeName: s1,
		s2.config.NodeName: s2,
		s3.config.NodeName: s3,
		s4.config.NodeName: s4}

	// Try to join
	joinLAN(t, s2, s1)
	joinLAN(t, s3, s1)
	joinLAN(t, s4, s1)

	// Differentiate between leader and server
	findStatus := func(leader bool) *Server {
		for _, mem := range servers {
			if mem.IsLeader() == leader {
				return mem
			}
			if !mem.IsLeader() == !leader {
				return mem
			}
		}

		return nil
	}
	testrpc.WaitForLeader(t, s1.RPC, dc)

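	// With four servers and MinQuorum set to 3, autopilot may clean up the first
	// dead server, but removing a second one would drop the cluster below
	// MinQuorum, so that server must stay in the failed state.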
	// Have autopilot take one into left
	dead := findStatus(false)
	if dead == nil {
		t.Fatalf("no members set")
	}
	require.NoError(t, dead.Shutdown())
	retry.Run(t, func(r *retry.R) {
		leader := findStatus(true)
		if leader == nil {
			r.Fatalf("no members set")
		}
		for _, m := range leader.LANMembersInAgentPartition() {
			if m.Name == dead.config.NodeName && m.Status != serf.StatusLeft {
				r.Fatalf("%v should be left, got %v", m.Name, m.Status.String())
			}
		}
	})

	delete(servers, dead.config.NodeName)
	// Autopilot should not take this one into left
	dead = findStatus(false)
	require.NoError(t, dead.Shutdown())

	retry.Run(t, func(r *retry.R) {
		leader := findStatus(true)
		if leader == nil {
			r.Fatalf("no members set")
		}
		for _, m := range leader.LANMembersInAgentPartition() {
			if m.Name == dead.config.NodeName && m.Status != serf.StatusFailed {
				r.Fatalf("%v should be failed, got %v", m.Name, m.Status.String())
			}
		}
	})
}

func TestAutopilot_EventPublishing(t *testing.T) {
	// This is really an integration level test. The general flow this test will follow is:
	//
	// 1. Start a 3 server cluster
	// 2. Subscribe to the ready server events
	// 3. Observe the first event, which will be ready almost immediately as it is
	//    the snapshot event.
	// 4. Wait for multiple iterations of the autopilot state updater and ensure no
	//    other events are seen. The state update interval is 50ms for tests unless
	//    overridden.
	// 5. Add a fourth server.
	// 6. Wait for an event to be emitted containing 4 ready servers.

	// 1. create the test cluster
	cluster := newTestCluster(t, &testClusterConfig{
		Servers:    3,
		ServerConf: testServerACLConfig,
		// We want to wait until each server has registered itself in the Catalog. Otherwise
		// the first snapshot event we see might have no servers in it while things are being
		// initialized. Doing this wait ensures that things are in the right state to start
		// the subscription.
	})

	// 2. subscribe to ready server events
	req := stream.SubscribeRequest{
		Topic:   autopilotevents.EventTopicReadyServers,
		Subject: stream.SubjectNone,
		Token:   TestDefaultInitialManagementToken,
	}
	sub, err := cluster.Servers[0].publisher.Subscribe(&req)
	require.NoError(t, err)
	t.Cleanup(sub.Unsubscribe)

	// 3. Observe that an event was generated which should be the snapshot event.
	//    As we have just bootstrapped the cluster with 3 servers we expect to
	//    see those 3 here.
	validatePayload(t, 3, mustGetEventWithTimeout(t, sub, 50*time.Millisecond))

	// TODO - it's kind of annoying that the EventPublisher doesn't have a mode where
	// it knows each event is a full state of the world. The ramifications are that
	// we have to expect/ignore the framing events for EndOfSnapshot.
	event := mustGetEventWithTimeout(t, sub, 10*time.Millisecond)
	require.True(t, event.IsFramingEvent())

	// 4. Wait for 3 iterations of the ServerHealthInterval to ensure no events
	//    are being published when the autopilot state is not changing.
	eventNotEmitted(t, sub, 150*time.Millisecond)

	// 5. Add a fourth server
	_, srv := testServerWithConfig(t, testServerACLConfig, func(c *Config) {
		c.Bootstrap = false
		c.BootstrapExpect = 0
	})
	joinLAN(t, srv, cluster.Servers[0])

	// 6. Now wait for the event for the fourth server being added. This may take a little
	//    while as the joinLAN operation above doesn't wait for the server to actually get
	//    added to Raft.
	validatePayload(t, 4, mustGetEventWithTimeout(t, sub, time.Second))
}

// mustGetEventWithTimeout is a helper function for validating that a Subscription.Next call will return
// an event within the given time. It also validates that no error is returned.
func mustGetEventWithTimeout(t *testing.T, subscription *stream.Subscription, timeout time.Duration) stream.Event {
	t.Helper()
	event, err := getEventWithTimeout(t, subscription, timeout)
	require.NoError(t, err)
	return event
}

// getEventWithTimeout is a helper function for retrieving an Event from a Subscription within the specified timeout.
func getEventWithTimeout(t *testing.T, subscription *stream.Subscription, timeout time.Duration) (stream.Event, error) {
	t.Helper()
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()
	event, err := subscription.Next(ctx)
	return event, err
}

// eventNotEmitted is a helper to validate that no Event is emitted for the given Subscription
func eventNotEmitted(t *testing.T, subscription *stream.Subscription, timeout time.Duration) {
	t.Helper()
	var event stream.Event
	var err error
	event, err = getEventWithTimeout(t, subscription, timeout)
	require.Equal(t, context.DeadlineExceeded, err, fmt.Sprintf("event:%v", event))
}

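// validatePayload asserts that an event is a ready-servers event whose payload
// contains the expected number of servers.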
func validatePayload(t *testing.T, expectedNumServers int, event stream.Event) {
	t.Helper()
	require.Equal(t, autopilotevents.EventTopicReadyServers, event.Topic)
	readyServers, ok := event.Payload.(autopilotevents.EventPayloadReadyServers)
	require.True(t, ok)
	require.Len(t, readyServers, expectedNumServers)
}