Set MinQuorum variable in Autopilot (#6654)

* Add MinQuorum to Autopilot
This commit is contained in:
Sarah Christoff 2019-10-29 09:04:41 -05:00 committed by GitHub
parent 969d51781a
commit 86b30bbfbe
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 149 additions and 3 deletions

View File

@ -1204,6 +1204,7 @@ func (a *Agent) consulConfig() (*consul.Config, error) {
base.AutopilotConfig.CleanupDeadServers = a.config.AutopilotCleanupDeadServers
base.AutopilotConfig.LastContactThreshold = a.config.AutopilotLastContactThreshold
base.AutopilotConfig.MaxTrailingLogs = uint64(a.config.AutopilotMaxTrailingLogs)
base.AutopilotConfig.MinQuorum = a.config.AutopilotMinQuorum
base.AutopilotConfig.ServerStabilizationTime = a.config.AutopilotServerStabilizationTime
base.AutopilotConfig.RedundancyZoneTag = a.config.AutopilotRedundancyZoneTag
base.AutopilotConfig.DisableUpgradeMigration = a.config.AutopilotDisableUpgradeMigration

View File

@ -728,6 +728,7 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) {
AutopilotDisableUpgradeMigration: b.boolVal(c.Autopilot.DisableUpgradeMigration),
AutopilotLastContactThreshold: b.durationVal("autopilot.last_contact_threshold", c.Autopilot.LastContactThreshold),
AutopilotMaxTrailingLogs: b.intVal(c.Autopilot.MaxTrailingLogs),
AutopilotMinQuorum: b.uintVal(c.Autopilot.MinQuorum),
AutopilotRedundancyZoneTag: b.stringVal(c.Autopilot.RedundancyZoneTag),
AutopilotServerStabilizationTime: b.durationVal("autopilot.server_stabilization_time", c.Autopilot.ServerStabilizationTime),
AutopilotUpgradeVersionTag: b.stringVal(c.Autopilot.UpgradeVersionTag),
@ -1444,6 +1445,17 @@ func (b *Builder) intVal(v *int) int {
return b.intValWithDefault(v, 0)
}
// uintVal returns the value v points to, or 0 when v is nil.
func (b *Builder) uintVal(v *uint) uint {
	const fallback uint = 0
	return b.uintValWithDefault(v, fallback)
}
// uintValWithDefault returns the value v points to. When v is nil it
// returns defaultVal instead.
func (b *Builder) uintValWithDefault(v *uint, defaultVal uint) uint {
	result := defaultVal
	if v != nil {
		result = *v
	}
	return result
}
func (b *Builder) uint64ValWithDefault(v *uint64, defaultVal uint64) uint64 {
if v == nil {
return defaultVal

View File

@ -362,6 +362,7 @@ type Autopilot struct {
DisableUpgradeMigration *bool `json:"disable_upgrade_migration,omitempty" hcl:"disable_upgrade_migration" mapstructure:"disable_upgrade_migration"`
LastContactThreshold *string `json:"last_contact_threshold,omitempty" hcl:"last_contact_threshold" mapstructure:"last_contact_threshold"`
MaxTrailingLogs *int `json:"max_trailing_logs,omitempty" hcl:"max_trailing_logs" mapstructure:"max_trailing_logs"`
MinQuorum *uint `json:"min_quorum,omitempty" hcl:"min_quorum" mapstructure:"min_quorum"`
RedundancyZoneTag *string `json:"redundancy_zone_tag,omitempty" hcl:"redundancy_zone_tag" mapstructure:"redundancy_zone_tag"`
ServerStabilizationTime *string `json:"server_stabilization_time,omitempty" hcl:"server_stabilization_time" mapstructure:"server_stabilization_time"`
UpgradeVersionTag *string `json:"upgrade_version_tag,omitempty" hcl:"upgrade_version_tag" mapstructure:"upgrade_version_tag"`

View File

@ -196,6 +196,12 @@ type RuntimeConfig struct {
// hcl: autopilot { max_trailing_logs = int }
AutopilotMaxTrailingLogs int
// AutopilotMinQuorum sets the minimum number of servers required in a cluster
// before autopilot can prune dead servers.
//
// hcl: autopilot { min_quorum = int }
AutopilotMinQuorum uint
// AutopilotRedundancyZoneTag is the Meta tag to use for separating servers
// into zones for redundancy. If left blank, this feature will be disabled.
// (Enterprise-only)

View File

@ -3520,6 +3520,7 @@ func TestFullConfig(t *testing.T) {
"disable_upgrade_migration": true,
"last_contact_threshold": "12705s",
"max_trailing_logs": 17849,
"min_quorum": 3,
"redundancy_zone_tag": "3IsufDJf",
"server_stabilization_time": "23057s",
"upgrade_version_tag": "W9pDwFAL"
@ -4117,6 +4118,7 @@ func TestFullConfig(t *testing.T) {
disable_upgrade_migration = true
last_contact_threshold = "12705s"
max_trailing_logs = 17849
min_quorum = 3
redundancy_zone_tag = "3IsufDJf"
server_stabilization_time = "23057s"
upgrade_version_tag = "W9pDwFAL"
@ -4819,6 +4821,7 @@ func TestFullConfig(t *testing.T) {
AutopilotDisableUpgradeMigration: true,
AutopilotLastContactThreshold: 12705 * time.Second,
AutopilotMaxTrailingLogs: 17849,
AutopilotMinQuorum: 3,
AutopilotRedundancyZoneTag: "3IsufDJf",
AutopilotServerStabilizationTime: 23057 * time.Second,
AutopilotUpgradeVersionTag: "W9pDwFAL",
@ -5703,6 +5706,7 @@ func TestSanitize(t *testing.T) {
"AutopilotDisableUpgradeMigration": false,
"AutopilotLastContactThreshold": "0s",
"AutopilotMaxTrailingLogs": 0,
"AutopilotMinQuorum": 0,
"AutopilotRedundancyZoneTag": "",
"AutopilotServerStabilizationTime": "0s",
"AutopilotUpgradeVersionTag": "",

View File

@ -234,7 +234,7 @@ func (a *Autopilot) pruneDeadServers() error {
// Only do removals if a minority of servers will be affected.
peers := NumPeers(raftConfig)
if removalCount < peers/2 {
if peers-removalCount >= int(conf.MinQuorum) && removalCount < peers/2 {
for _, node := range failed {
a.logger.Printf("[INFO] autopilot: Attempting removal of failed server node %q", node.Name)
go serfLAN.RemoveFailedNode(node.Name)

View File

@ -20,6 +20,10 @@ type Config struct {
// be behind before being considered unhealthy.
MaxTrailingLogs uint64
// MinQuorum sets the minimum number of servers required in a cluster
// before autopilot can prune dead servers.
MinQuorum uint
// ServerStabilizationTime is the minimum amount of time a server must be
// in a stable, healthy state before it can be added to the cluster. Only
// applicable with Raft protocol version 3 or higher.

View File

@ -369,3 +369,97 @@ func TestAutopilot_PromoteNonVoter(t *testing.T) {
}
})
}
// TestAutopilot_BootstrapExpect starts four servers with
// AutopilotConfig.MinQuorum set to 3 and shuts down two non-leader servers
// one after the other. After the first shutdown the leader's LAN member list
// is expected to show the server as serf.StatusLeft; after the second it is
// expected to show the server as serf.StatusFailed.
func TestAutopilot_BootstrapExpect(t *testing.T) {
	dc := "dc1"

	// closeMap holds, per node name, a channel that is closed from that
	// server's NotifyShutdown callback.
	closeMap := make(map[string]chan struct{})
	conf := func(c *Config) {
		c.Datacenter = dc
		c.Bootstrap = false
		c.BootstrapExpect = 3
		c.AutopilotConfig.MinQuorum = 3
		c.RaftConfig.ProtocolVersion = raft.ProtocolVersion(2)
		c.AutopilotInterval = 100 * time.Millisecond

		// Let us know when a server is actually gone.
		ch := make(chan struct{})
		c.NotifyShutdown = func() {
			t.Logf("%v is shutdown", c.NodeName)
			close(ch)
		}
		closeMap[c.NodeName] = ch
	}

	dir1, s1 := testServerWithConfig(t, conf)
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()

	dir2, s2 := testServerWithConfig(t, conf)
	defer os.RemoveAll(dir2)
	defer s2.Shutdown()

	dir3, s3 := testServerWithConfig(t, conf)
	defer os.RemoveAll(dir3)
	defer s3.Shutdown()

	dir4, s4 := testServerWithConfig(t, conf)
	defer os.RemoveAll(dir4)
	defer s4.Shutdown()

	servers := map[string]*Server{
		s1.config.NodeName: s1,
		s2.config.NodeName: s2,
		s3.config.NodeName: s3,
		s4.config.NodeName: s4,
	}

	// Try to join.
	joinLAN(t, s2, s1)
	joinLAN(t, s3, s1)
	joinLAN(t, s4, s1)

	// findStatus returns a tracked server whose leadership state equals
	// leader, or nil when there is none. It deliberately does not fail the
	// test itself: it is also called inside retry.Run closures, where a
	// missing leader has to be reported through r rather than through t.
	findStatus := func(leader bool) *Server {
		for _, mem := range servers {
			if mem.IsLeader() == leader {
				return mem
			}
		}
		return nil
	}

	for _, s := range servers {
		testrpc.WaitForLeader(t, s.RPC, dc)
		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 4)) })
	}

	// Have autopilot take one into left.
	dead := findStatus(false)
	if dead == nil {
		t.Fatalf("no members set")
	}
	dead.Shutdown()
	<-closeMap[dead.config.NodeName]
	retry.Run(t, func(r *retry.R) {
		leader := findStatus(true)
		if leader == nil {
			r.Fatalf("no members set")
			return
		}
		for _, m := range leader.LANMembers() {
			if m.Name == dead.config.NodeName && m.Status != serf.StatusLeft {
				r.Fatalf("%v should be left, got %v", m.Name, m.Status.String())
			}
		}
	})

	// Stop tracking the server that is gone so findStatus cannot pick it again.
	delete(servers, dead.config.NodeName)

	// Autopilot should not take this one into left.
	dead = findStatus(false)
	if dead == nil {
		t.Fatalf("no members set")
	}
	dead.Shutdown()
	<-closeMap[dead.config.NodeName]
	retry.Run(t, func(r *retry.R) {
		leader := findStatus(true)
		if leader == nil {
			r.Fatalf("no members set")
			return
		}
		for _, m := range leader.LANMembers() {
			if m.Name == dead.config.NodeName && m.Status != serf.StatusFailed {
				r.Fatalf("%v should be failed, got %v", m.Name, m.Status.String())
			}
		}
	})
}

View File

@ -11,7 +11,7 @@ import (
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/sdk/testutil/retry"
"github.com/hashicorp/consul/testrpc"
"github.com/hashicorp/net-rpc-msgpackrpc"
msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc"
"github.com/hashicorp/raft"
)
@ -116,6 +116,7 @@ func TestOperator_Autopilot_SetConfiguration(t *testing.T) {
Datacenter: "dc1",
Config: autopilot.Config{
CleanupDeadServers: true,
MinQuorum: 3,
},
}
var reply *bool
@ -130,7 +131,7 @@ func TestOperator_Autopilot_SetConfiguration(t *testing.T) {
if err != nil {
t.Fatal(err)
}
if !config.CleanupDeadServers {
if !config.CleanupDeadServers && config.MinQuorum != 3 {
t.Fatalf("bad: %#v", config)
}
}

View File

@ -201,6 +201,7 @@ func (s *HTTPServer) OperatorAutopilotConfiguration(resp http.ResponseWriter, re
CleanupDeadServers: reply.CleanupDeadServers,
LastContactThreshold: api.NewReadableDuration(reply.LastContactThreshold),
MaxTrailingLogs: reply.MaxTrailingLogs,
MinQuorum: reply.MinQuorum,
ServerStabilizationTime: api.NewReadableDuration(reply.ServerStabilizationTime),
RedundancyZoneTag: reply.RedundancyZoneTag,
DisableUpgradeMigration: reply.DisableUpgradeMigration,
@ -226,6 +227,7 @@ func (s *HTTPServer) OperatorAutopilotConfiguration(resp http.ResponseWriter, re
CleanupDeadServers: conf.CleanupDeadServers,
LastContactThreshold: conf.LastContactThreshold.Duration(),
MaxTrailingLogs: conf.MaxTrailingLogs,
MinQuorum: conf.MinQuorum,
ServerStabilizationTime: conf.ServerStabilizationTime.Duration(),
RedundancyZoneTag: conf.RedundancyZoneTag,
DisableUpgradeMigration: conf.DisableUpgradeMigration,

View File

@ -25,6 +25,10 @@ type AutopilotConfiguration struct {
// be behind before being considered unhealthy.
MaxTrailingLogs uint64
// MinQuorum sets the minimum number of servers allowed in a cluster before
// autopilot can prune dead servers.
MinQuorum uint
// ServerStabilizationTime is the minimum amount of time a server must be
// in a stable, healthy state before it can be added to the cluster. Only
// applicable with Raft protocol version 3 or higher.

View File

@ -58,6 +58,7 @@ func (c *cmd) Run(args []string) int {
c.UI.Output(fmt.Sprintf("CleanupDeadServers = %v", config.CleanupDeadServers))
c.UI.Output(fmt.Sprintf("LastContactThreshold = %v", config.LastContactThreshold.String()))
c.UI.Output(fmt.Sprintf("MaxTrailingLogs = %v", config.MaxTrailingLogs))
c.UI.Output(fmt.Sprintf("MinQuorum = %v", config.MinQuorum))
c.UI.Output(fmt.Sprintf("ServerStabilizationTime = %v", config.ServerStabilizationTime.String()))
c.UI.Output(fmt.Sprintf("RedundancyZoneTag = %q", config.RedundancyZoneTag))
c.UI.Output(fmt.Sprintf("DisableUpgradeMigration = %v", config.DisableUpgradeMigration))

View File

@ -25,6 +25,7 @@ type cmd struct {
// flags
cleanupDeadServers flags.BoolValue
maxTrailingLogs flags.UintValue
minQuorum flags.UintValue
lastContactThreshold flags.DurationValue
serverStabilizationTime flags.DurationValue
redundancyZoneTag flags.StringValue
@ -40,6 +41,9 @@ func (c *cmd) init() {
c.flags.Var(&c.maxTrailingLogs, "max-trailing-logs",
"Controls the maximum number of log entries that a server can trail the "+
"leader by before being considered unhealthy.")
c.flags.Var(&c.minQuorum, "min-quorum",
"Sets the minimum number of of servers required in a cluster before autopilot "+
"is allowed to prune dead servers.")
c.flags.Var(&c.lastContactThreshold, "last-contact-threshold",
"Controls the maximum amount of time a server can go without contact "+
"from the leader before being considered unhealthy. Must be a duration value "+
@ -94,6 +98,7 @@ func (c *cmd) Run(args []string) int {
c.redundancyZoneTag.Merge(&conf.RedundancyZoneTag)
c.disableUpgradeMigration.Merge(&conf.DisableUpgradeMigration)
c.upgradeVersionTag.Merge(&conf.UpgradeVersionTag)
c.minQuorum.Merge(&conf.MinQuorum)
trailing := uint(conf.MaxTrailingLogs)
c.maxTrailingLogs.Merge(&trailing)

View File

@ -34,6 +34,7 @@ func TestOperatorAutopilotSetConfigCommand(t *testing.T) {
"-max-trailing-logs=99",
"-last-contact-threshold=123ms",
"-server-stabilization-time=123ms",
"-min-quorum=3",
}
code := c.Run(args)
@ -65,4 +66,7 @@ func TestOperatorAutopilotSetConfigCommand(t *testing.T) {
if reply.ServerStabilizationTime != 123*time.Millisecond {
t.Fatalf("bad: %#v", reply)
}
if reply.MinQuorum != 3 {
t.Fatalf("bad: %#v", reply)
}
}

View File

@ -108,6 +108,9 @@ The table below shows this endpoint's support for
- `MaxTrailingLogs` `(int: 250)` specifies the maximum number of log entries
that a server can trail the leader by before being considered unhealthy.
- `MinQuorum` `(int: 0)` - Specifies the minimum number of servers needed before
Autopilot can prune dead servers.
- `ServerStabilizationTime` `(string: "10s")` - Specifies the minimum amount of
time a server must be stable in the 'healthy' state before being added to the
cluster. Only takes effect if all servers are running Raft protocol version 3
@ -134,6 +137,7 @@ The table below shows this endpoint's support for
"CleanupDeadServers": true,
"LastContactThreshold": "200ms",
"MaxTrailingLogs": 250,
"MinQuorum": 3,
"ServerStabilizationTime": "10s",
"RedundancyZoneTag": "",
"DisableUpgradeMigration": false,

View File

@ -782,6 +782,9 @@ default will automatically work with some tooling.
the maximum number of log entries that a server can trail the leader by before being considered unhealthy. Defaults
to 250.
* <a name="min_quorum"></a><a href="#min_quorum">`min_quorum`</a> - Sets the minimum number of servers necessary in a cluster
before autopilot can prune dead servers. There is no default.
* <a name="server_stabilization_time"></a><a href="#server_stabilization_time">`server_stabilization_time`</a> -
Controls the minimum amount of time a server must be stable in the 'healthy' state before being added to the
cluster. Only takes effect if all servers are running Raft protocol version 3 or higher. Must be a duration value