open-vault/command/operator_raft_autopilot_set_config.go
Vishal Nayak 3e55e79a3f
Autopilot: Server Stabilization, State and Dead Server Cleanup (#10856)
* k8s doc: update for 0.9.1 and 0.8.0 releases (#10825)

* k8s doc: update for 0.9.1 and 0.8.0 releases

* Update website/content/docs/platform/k8s/helm/configuration.mdx

Co-authored-by: Theron Voran <tvoran@users.noreply.github.com>

Co-authored-by: Theron Voran <tvoran@users.noreply.github.com>

* Autopilot initial commit

* Move autopilot related backend implementations to its own file

* Abstract promoter creation

* Add nil check for health

* Add server state oss no-ops

* Config ext stub for oss

* Make way for non-voters

* s/health/state

* s/ReadReplica/NonVoter

* Add synopsis and description

* Remove struct tags from AutopilotConfig

* Use var for config storage path

* Handle nin-config when reading

* Enable testing autopilot by using inmem cluster

* First passing test

* Only report the server as known if it is present in raft config

* Autopilot defaults to on for all existing and new clusters

* Add locking to some functions

* Persist initial config

* Clarify the command usage doc

* Add health metric for each node

* Fix audit logging issue

* Don't set DisablePerformanceStandby to true in test

* Use node id label for health metric

* Log updates to autopilot config

* Less aggressively consume config loading failures

* Return a mutable config

* Return early from known servers if raft config is unable to be pulled

* Update metrics name

* Reduce log level for potentially noisy log

* Add knob to disable autopilot

* Don't persist if default config is in use

* Autopilot: Dead server cleanup (#10857)

* Dead server cleanup

* Initialize channel in any case

* Fix a bunch of tests

* Fix panic

* Add follower locking in heartbeat tracker

* Add LastContactFailureThreshold to config

* Add log when marking node as dead

* Update follower state locking in heartbeat tracker

* Avoid follower states being nil

* Pull test to its own file

* Add execution status to state response

* Optionally enable autopilot in some tests

* Updates

* Added API function to fetch autopilot configuration

* Add test for default autopilot configuration

* Configuration tests

* Add State API test

* Update test

* Added TestClusterOptions.PhysicalFactoryConfig

* Update locking

* Adjust locking in heartbeat tracker

* s/last_contact_failure_threshold/left_server_last_contact_threshold

* Add disabling autopilot as a core config option

* Disable autopilot in some tests

* s/left_server_last_contact_threshold/dead_server_last_contact_threshold

* Set the lastheartbeat of followers to now when setting up active node

* Don't use config defaults from CLI command

* Remove config file support

* Remove HCL test as well

* Persist only supplied config; merge supplied config with default to operate

* Use pointer to structs for storing follower information

* Test update

* Retrieve non voter status from configbucket and set it up when a node comes up

* Manage desired suffrage

* Consider bucket being created already

* Move desired suffrage to its own entry

* s/DesiredSuffrageKey/LocalNodeConfigKey

* s/witnessSuffrage/recordSuffrage

* Fix test compilation

* Handle local node config post a snapshot install

* Commit to storage first; then record suffrage in fsm

* No need of local node config being nili case, post snapshot restore

* Reconcile autopilot config when a new leader takes over duty

* Grab fsm lock when recording suffrage

* s/Suffrage/DesiredSuffrage in FollowerState

* Instantiate autopilot only in leader

* Default to old ways in more scenarios

* Make API gracefully handle 404

* Address some feedback

* Make IsDead an atomic.Value

* Simplify follower hearbeat tracking

* Use uber.atomic

* Don't have multiple causes for having autopilot disabled

* Don't remove node from follower states if we fail to remove the dead server

* Autopilot server removals map (#11019)

* Don't remove node from follower states if we fail to remove the dead server

* Use map to track dead server removals

* Use lock and map

* Use delegate lock

* Adjust when to remove entry from map

* Only hold the lock while accessing map

* Fix race

* Don't set default min_quorum

* Fix test

* Ensure follower states is not nil before starting autopilot

* Fix race

Co-authored-by: Jason O'Donnell <2160810+jasonodonnell@users.noreply.github.com>
Co-authored-by: Theron Voran <tvoran@users.noreply.github.com>
2021-03-03 13:59:50 -05:00

138 lines
3.3 KiB
Go

package command
import (
"fmt"
"strings"
"time"
"github.com/mitchellh/cli"
"github.com/posener/complete"
)
var _ cli.Command = (*OperatorRaftAutopilotSetConfigCommand)(nil)
var _ cli.CommandAutocomplete = (*OperatorRaftAutopilotSetConfigCommand)(nil)
type OperatorRaftAutopilotSetConfigCommand struct {
*BaseCommand
flagCleanupDeadServers BoolPtr
flagLastContactThreshold time.Duration
flagDeadServerLastContactThreshold time.Duration
flagMaxTrailingLogs uint64
flagMinQuorum uint
flagServerStabilizationTime time.Duration
}
func (c *OperatorRaftAutopilotSetConfigCommand) Synopsis() string {
return "Modify the configuration of the autopilot subsystem under integrated storage"
}
func (c *OperatorRaftAutopilotSetConfigCommand) Help() string {
helpText := `
Usage: vault operator raft autopilot set-config [options]
Modify the configuration of the autopilot subsystem under integrated storage.
` + c.Flags().Help()
return strings.TrimSpace(helpText)
}
func (c *OperatorRaftAutopilotSetConfigCommand) Flags() *FlagSets {
set := c.flagSet(FlagSetHTTP | FlagSetOutputFormat)
f := set.NewFlagSet("Common Options")
f.BoolPtrVar(&BoolPtrVar{
Name: "cleanup-dead-servers",
Target: &c.flagCleanupDeadServers,
})
f.DurationVar(&DurationVar{
Name: "last-contact-threshold",
Target: &c.flagLastContactThreshold,
})
f.DurationVar(&DurationVar{
Name: "dead-server-last-contact-threshold",
Target: &c.flagDeadServerLastContactThreshold,
})
f.Uint64Var(&Uint64Var{
Name: "max-trailing-logs",
Target: &c.flagMaxTrailingLogs,
})
f.UintVar(&UintVar{
Name: "min-quorum",
Target: &c.flagMinQuorum,
})
f.DurationVar(&DurationVar{
Name: "server-stabilization-time",
Target: &c.flagServerStabilizationTime,
})
return set
}
func (c *OperatorRaftAutopilotSetConfigCommand) AutocompleteArgs() complete.Predictor {
return complete.PredictAnything
}
func (c *OperatorRaftAutopilotSetConfigCommand) AutocompleteFlags() complete.Flags {
return c.Flags().Completions()
}
func (c *OperatorRaftAutopilotSetConfigCommand) Run(args []string) int {
f := c.Flags()
if err := f.Parse(args); err != nil {
c.UI.Error(err.Error())
return 1
}
args = f.Args()
switch len(args) {
case 0:
default:
c.UI.Error(fmt.Sprintf("Incorrect arguments (expected 0, got %d)", len(args)))
return 1
}
client, err := c.Client()
if err != nil {
c.UI.Error(err.Error())
return 2
}
data := make(map[string]interface{})
if c.flagCleanupDeadServers.IsSet() {
data["cleanup_dead_servers"] = c.flagCleanupDeadServers.Get()
}
if c.flagMaxTrailingLogs > 0 {
data["max_trailing_logs"] = c.flagMaxTrailingLogs
}
if c.flagMinQuorum > 0 {
data["min_quorum"] = c.flagMinQuorum
}
if c.flagLastContactThreshold > 0 {
data["last_contact_threshold"] = c.flagLastContactThreshold.String()
}
if c.flagDeadServerLastContactThreshold > 0 {
data["dead_server_last_contact_threshold"] = c.flagDeadServerLastContactThreshold.String()
}
if c.flagServerStabilizationTime > 0 {
data["server_stabilization_time"] = c.flagServerStabilizationTime.String()
}
secret, err := client.Logical().Write("sys/storage/raft/autopilot/configuration", data)
if err != nil {
c.UI.Error(err.Error())
return 2
}
if secret == nil {
return 0
}
return OutputSecret(c.UI, secret)
}