Autopilot: Server Stabilization, State and Dead Server Cleanup (#10856)
* k8s doc: update for 0.9.1 and 0.8.0 releases (#10825)
* k8s doc: update for 0.9.1 and 0.8.0 releases
* Update website/content/docs/platform/k8s/helm/configuration.mdx
Co-authored-by: Theron Voran <tvoran@users.noreply.github.com>
Co-authored-by: Theron Voran <tvoran@users.noreply.github.com>
* Autopilot initial commit
* Move autopilot related backend implementations to its own file
* Abstract promoter creation
* Add nil check for health
* Add server state oss no-ops
* Config ext stub for oss
* Make way for non-voters
* s/health/state
* s/ReadReplica/NonVoter
* Add synopsis and description
* Remove struct tags from AutopilotConfig
* Use var for config storage path
* Handle nil config when reading
* Enable testing autopilot by using inmem cluster
* First passing test
* Only report the server as known if it is present in raft config
* Autopilot defaults to on for all existing and new clusters
* Add locking to some functions
* Persist initial config
* Clarify the command usage doc
* Add health metric for each node
* Fix audit logging issue
* Don't set DisablePerformanceStandby to true in test
* Use node id label for health metric
* Log updates to autopilot config
* Less aggressively consume config loading failures
* Return a mutable config
* Return early from known servers if raft config is unable to be pulled
* Update metrics name
* Reduce log level for potentially noisy log
* Add knob to disable autopilot
* Don't persist if default config is in use
* Autopilot: Dead server cleanup (#10857)
* Dead server cleanup
* Initialize channel in any case
* Fix a bunch of tests
* Fix panic
* Add follower locking in heartbeat tracker
* Add LastContactFailureThreshold to config
* Add log when marking node as dead
* Update follower state locking in heartbeat tracker
* Avoid follower states being nil
* Pull test to its own file
* Add execution status to state response
* Optionally enable autopilot in some tests
* Updates
* Added API function to fetch autopilot configuration
* Add test for default autopilot configuration
* Configuration tests
* Add State API test
* Update test
* Added TestClusterOptions.PhysicalFactoryConfig
* Update locking
* Adjust locking in heartbeat tracker
* s/last_contact_failure_threshold/left_server_last_contact_threshold
* Add disabling autopilot as a core config option
* Disable autopilot in some tests
* s/left_server_last_contact_threshold/dead_server_last_contact_threshold
* Set the last heartbeat of followers to now when setting up the active node
* Don't use config defaults from CLI command
* Remove config file support
* Remove HCL test as well
* Persist only supplied config; merge supplied config with default to operate
* Use pointer to structs for storing follower information
* Test update
* Retrieve non-voter status from config bucket and set it up when a node comes up
* Manage desired suffrage
* Consider bucket being created already
* Move desired suffrage to its own entry
* s/DesiredSuffrageKey/LocalNodeConfigKey
* s/witnessSuffrage/recordSuffrage
* Fix test compilation
* Handle local node config post a snapshot install
* Commit to storage first; then record suffrage in fsm
* No need to handle local node config being nil post snapshot restore
* Reconcile autopilot config when a new leader takes over duty
* Grab fsm lock when recording suffrage
* s/Suffrage/DesiredSuffrage in FollowerState
* Instantiate autopilot only in leader
* Default to old ways in more scenarios
* Make API gracefully handle 404
* Address some feedback
* Make IsDead an atomic.Value
* Simplify follower heartbeat tracking
* Use uber.atomic
* Don't have multiple causes for having autopilot disabled
* Don't remove node from follower states if we fail to remove the dead server
* Autopilot server removals map (#11019)
* Don't remove node from follower states if we fail to remove the dead server
* Use map to track dead server removals
* Use lock and map
* Use delegate lock
* Adjust when to remove entry from map
* Only hold the lock while accessing map
* Fix race
* Don't set default min_quorum
* Fix test
* Ensure follower states is not nil before starting autopilot
* Fix race

Co-authored-by: Jason O'Donnell <2160810+jasonodonnell@users.noreply.github.com>
Co-authored-by: Theron Voran <tvoran@users.noreply.github.com>
Parent: 9741f51bee
Commit: 3e55e79a3f

api/sys_raft.go | 154
@@ -2,9 +2,16 @@ package api

import (
    "context"
    "encoding/json"
    "errors"
    "fmt"
    "io"
    "net/http"
    "time"

    "github.com/hashicorp/vault/sdk/helper/parseutil"

    "github.com/mitchellh/mapstructure"

    "github.com/hashicorp/vault/sdk/helper/consts"
)

@@ -27,6 +34,77 @@ type RaftJoinRequest struct {
    NonVoter bool `json:"non_voter"`
}

// AutopilotConfig is used for querying/setting the Autopilot configuration.
type AutopilotConfig struct {
    CleanupDeadServers             bool          `json:"cleanup_dead_servers" mapstructure:"cleanup_dead_servers"`
    LastContactThreshold           time.Duration `json:"last_contact_threshold" mapstructure:"-"`
    DeadServerLastContactThreshold time.Duration `json:"dead_server_last_contact_threshold" mapstructure:"-"`
    MaxTrailingLogs                uint64        `json:"max_trailing_logs" mapstructure:"max_trailing_logs"`
    MinQuorum                      uint          `json:"min_quorum" mapstructure:"min_quorum"`
    ServerStabilizationTime        time.Duration `json:"server_stabilization_time" mapstructure:"-"`
}

// UnmarshalJSON parses the autopilot config JSON blob
func (ac *AutopilotConfig) UnmarshalJSON(b []byte) error {
    var data interface{}
    err := json.Unmarshal(b, &data)
    if err != nil {
        return err
    }

    conf := data.(map[string]interface{})
    if err = mapstructure.WeakDecode(conf, ac); err != nil {
        return err
    }
    if ac.LastContactThreshold, err = parseutil.ParseDurationSecond(conf["last_contact_threshold"]); err != nil {
        return err
    }
    if ac.DeadServerLastContactThreshold, err = parseutil.ParseDurationSecond(conf["dead_server_last_contact_threshold"]); err != nil {
        return err
    }
    if ac.ServerStabilizationTime, err = parseutil.ParseDurationSecond(conf["server_stabilization_time"]); err != nil {
        return err
    }
    return nil
}

// AutopilotExecutionStatus represents the current status of the autopilot background go routines
type AutopilotExecutionStatus string

const (
    AutopilotNotRunning   AutopilotExecutionStatus = "not-running"
    AutopilotRunning      AutopilotExecutionStatus = "running"
    AutopilotShuttingDown AutopilotExecutionStatus = "shutting-down"
)

// AutopilotState represents the response of the raft autopilot state API
type AutopilotState struct {
    ExecutionStatus            AutopilotExecutionStatus    `mapstructure:"execution_status"`
    Healthy                    bool                        `mapstructure:"healthy"`
    FailureTolerance           int                         `mapstructure:"failure_tolerance"`
    OptimisticFailureTolerance int                         `mapstructure:"optimistic_failure_tolerance"`
    Servers                    map[string]*AutopilotServer `mapstructure:"servers"`
    Leader                     string                      `mapstructure:"leader"`
    Voters                     []string                    `mapstructure:"voters"`
    NonVoters                  []string                    `mapstructure:"non_voters"`
}

// AutopilotServer represents the server blocks in the response of the raft
// autopilot state API.
type AutopilotServer struct {
    ID          string            `mapstructure:"id"`
    Name        string            `mapstructure:"name"`
    Address     string            `mapstructure:"address"`
    NodeStatus  string            `mapstructure:"node_status"`
    LastContact string            `mapstructure:"last_contact"`
    LastTerm    uint64            `mapstructure:"last_term"`
    LastIndex   uint64            `mapstructure:"last_index"`
    Healthy     bool              `mapstructure:"healthy"`
    StableSince string            `mapstructure:"stable_since"`
    Status      string            `mapstructure:"status"`
    Meta        map[string]string `mapstructure:"meta"`
}

// RaftJoin adds the node from which this call is invoked from to the raft
// cluster represented by the leader address in the parameter.
func (c *Sys) RaftJoin(opts *RaftJoinRequest) (*RaftJoinResponse, error) {

@@ -160,3 +238,79 @@ func (c *Sys) RaftSnapshotRestore(snapReader io.Reader, force bool) error {
    return nil
}

// RaftAutopilotState returns the state of the raft cluster as seen by autopilot.
func (c *Sys) RaftAutopilotState() (*AutopilotState, error) {
    r := c.c.NewRequest("GET", "/v1/sys/storage/raft/autopilot/state")

    ctx, cancelFunc := context.WithCancel(context.Background())
    defer cancelFunc()
    resp, err := c.c.RawRequestWithContext(ctx, r)
    if resp != nil {
        defer resp.Body.Close()
        if resp.StatusCode == 404 {
            return nil, nil
        }
    }
    if err != nil {
        return nil, err
    }

    secret, err := ParseSecret(resp.Body)
    if err != nil {
        return nil, err
    }
    if secret == nil || secret.Data == nil {
        return nil, errors.New("data from server response is empty")
    }

    var result AutopilotState
    err = mapstructure.Decode(secret.Data, &result)
    if err != nil {
        return nil, err
    }

    return &result, err
}

// RaftAutopilotConfiguration fetches the autopilot config.
func (c *Sys) RaftAutopilotConfiguration() (*AutopilotConfig, error) {
    r := c.c.NewRequest("GET", "/v1/sys/storage/raft/autopilot/configuration")

    ctx, cancelFunc := context.WithCancel(context.Background())
    defer cancelFunc()
    resp, err := c.c.RawRequestWithContext(ctx, r)
    if resp != nil {
        defer resp.Body.Close()
        if resp.StatusCode == 404 {
            return nil, nil
        }
    }
    if err != nil {
        return nil, err
    }

    secret, err := ParseSecret(resp.Body)
    if err != nil {
        return nil, err
    }
    if secret == nil {
        return nil, errors.New("data from server response is empty")
    }

    var result AutopilotConfig
    if err = mapstructure.Decode(secret.Data, &result); err != nil {
        return nil, err
    }
    if result.LastContactThreshold, err = parseutil.ParseDurationSecond(secret.Data["last_contact_threshold"]); err != nil {
        return nil, err
    }
    if result.DeadServerLastContactThreshold, err = parseutil.ParseDurationSecond(secret.Data["dead_server_last_contact_threshold"]); err != nil {
        return nil, err
    }
    if result.ServerStabilizationTime, err = parseutil.ParseDurationSecond(secret.Data["server_stabilization_time"]); err != nil {
        return nil, err
    }

    return &result, err
}
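A minimal sketch of how these new client methods might be called from Go. Assumptions: a reachable Vault server at the placeholder address below, a valid token, and a cluster that supports the autopilot endpoints (a nil return indicates the endpoint answered 404).

package main

import (
    "fmt"
    "log"

    "github.com/hashicorp/vault/api"
)

func main() {
    // Placeholder address and token, for illustration only.
    cfg := api.DefaultConfig()
    cfg.Address = "http://127.0.0.1:8200"

    client, err := api.NewClient(cfg)
    if err != nil {
        log.Fatal(err)
    }
    client.SetToken("root")

    // Fetch the autopilot configuration; nil means the endpoint returned 404.
    conf, err := client.Sys().RaftAutopilotConfiguration()
    if err != nil {
        log.Fatal(err)
    }
    if conf != nil {
        fmt.Println("cleanup_dead_servers:", conf.CleanupDeadServers)
        fmt.Println("server_stabilization_time:", conf.ServerStabilizationTime)
    }

    // Fetch the cluster state as seen by autopilot.
    state, err := client.Sys().RaftAutopilotState()
    if err != nil {
        log.Fatal(err)
    }
    if state != nil {
        fmt.Println("healthy:", state.Healthy, "leader:", state.Leader)
    }
}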
|
|
@ -380,9 +380,9 @@ func (c *BaseCommand) flagSet(bit FlagSetBit) *FlagSets {
|
|||
Target: &c.flagFormat,
|
||||
Default: "table",
|
||||
EnvVar: EnvVaultFormat,
|
||||
Completion: complete.PredictSet("table", "json", "yaml"),
|
||||
Usage: "Print the output in the given format. Valid formats " +
|
||||
"are \"table\", \"json\", or \"yaml\".",
|
||||
Completion: complete.PredictSet("table", "json", "yaml", "pretty"),
|
||||
Usage: `Print the output in the given format. Valid formats
|
||||
are "table", "json", "yaml", or "pretty".`,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
|
@ -359,6 +359,21 @@ func initCommands(ui, serverCmdUi cli.Ui, runOpts *RunOptions) {
|
|||
BaseCommand: getBaseCommand(),
|
||||
}, nil
|
||||
},
|
||||
"operator raft autopilot get-config": func() (cli.Command, error) {
|
||||
return &OperatorRaftAutopilotGetConfigCommand{
|
||||
BaseCommand: getBaseCommand(),
|
||||
}, nil
|
||||
},
|
||||
"operator raft autopilot set-config": func() (cli.Command, error) {
|
||||
return &OperatorRaftAutopilotSetConfigCommand{
|
||||
BaseCommand: getBaseCommand(),
|
||||
}, nil
|
||||
},
|
||||
"operator raft autopilot state": func() (cli.Command, error) {
|
||||
return &OperatorRaftAutopilotStateCommand{
|
||||
BaseCommand: getBaseCommand(),
|
||||
}, nil
|
||||
},
|
||||
"operator raft list-peers": func() (cli.Command, error) {
|
||||
return &OperatorRaftListPeersCommand{
|
||||
BaseCommand: getBaseCommand(),
|
||||
|
|
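The registrations above wire up three new CLI subcommands. Based on the command names and the flags defined later in this diff, typical invocations would look like the following (illustrative only; flag values are placeholders):

    vault operator raft autopilot get-config
    vault operator raft autopilot set-config -cleanup-dead-servers=true -min-quorum=3 -dead-server-last-contact-threshold=10m
    vault operator raft autopilot state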
|
@ -1,6 +1,7 @@
|
|||
package command
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
|
@ -64,10 +65,11 @@ type Formatter interface {
|
|||
}
|
||||
|
||||
var Formatters = map[string]Formatter{
|
||||
"json": JsonFormatter{},
|
||||
"table": TableFormatter{},
|
||||
"yaml": YamlFormatter{},
|
||||
"yml": YamlFormatter{},
|
||||
"json": JsonFormatter{},
|
||||
"table": TableFormatter{},
|
||||
"yaml": YamlFormatter{},
|
||||
"yml": YamlFormatter{},
|
||||
"pretty": PrettyFormatter{},
|
||||
}
|
||||
|
||||
func Format(ui cli.Ui) string {
|
||||
|
@ -115,6 +117,98 @@ func (y YamlFormatter) Output(ui cli.Ui, secret *api.Secret, data interface{}) e
|
|||
return err
|
||||
}
|
||||
|
||||
type PrettyFormatter struct{}
|
||||
|
||||
func (p PrettyFormatter) Format(data interface{}) ([]byte, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (p PrettyFormatter) Output(ui cli.Ui, secret *api.Secret, data interface{}) error {
|
||||
switch data.(type) {
|
||||
case *api.AutopilotState:
|
||||
p.OutputAutopilotState(ui, data)
|
||||
default:
|
||||
return errors.New("cannot use the pretty formatter for this type")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func outputStringSlice(buffer *bytes.Buffer, indent string, values []string) {
|
||||
for _, val := range values {
|
||||
buffer.WriteString(fmt.Sprintf("%s%s\n", indent, val))
|
||||
}
|
||||
}
|
||||
|
||||
type mapOutput struct {
|
||||
key string
|
||||
value string
|
||||
}
|
||||
|
||||
func formatServer(srv *api.AutopilotServer) string {
|
||||
var buffer bytes.Buffer
|
||||
|
||||
buffer.WriteString(fmt.Sprintf(" %s\n", srv.ID))
|
||||
buffer.WriteString(fmt.Sprintf(" Name: %s\n", srv.Name))
|
||||
buffer.WriteString(fmt.Sprintf(" Address: %s\n", srv.Address))
|
||||
buffer.WriteString(fmt.Sprintf(" Status: %s\n", srv.Status))
|
||||
buffer.WriteString(fmt.Sprintf(" Node Status: %s\n", srv.NodeStatus))
|
||||
buffer.WriteString(fmt.Sprintf(" Healthy: %t\n", srv.Healthy))
|
||||
buffer.WriteString(fmt.Sprintf(" Last Contact: %s\n", srv.LastContact))
|
||||
buffer.WriteString(fmt.Sprintf(" Last Term: %d\n", srv.LastTerm))
|
||||
buffer.WriteString(fmt.Sprintf(" Last Index: %d\n", srv.LastIndex))
|
||||
|
||||
if len(srv.Meta) > 0 {
|
||||
buffer.WriteString(fmt.Sprintf(" Meta\n"))
|
||||
var outputs []mapOutput
|
||||
for k, v := range srv.Meta {
|
||||
outputs = append(outputs, mapOutput{key: k, value: fmt.Sprintf(" %q: %q\n", k, v)})
|
||||
}
|
||||
|
||||
sort.Slice(outputs, func(i, j int) bool {
|
||||
return outputs[i].key < outputs[j].key
|
||||
})
|
||||
|
||||
for _, output := range outputs {
|
||||
buffer.WriteString(output.value)
|
||||
}
|
||||
}
|
||||
|
||||
return buffer.String()
|
||||
}
|
||||
|
||||
func (p PrettyFormatter) OutputAutopilotState(ui cli.Ui, data interface{}) {
|
||||
state := data.(*api.AutopilotState)
|
||||
|
||||
var buffer bytes.Buffer
|
||||
buffer.WriteString(fmt.Sprintf("Healthy: %t\n", state.Healthy))
|
||||
buffer.WriteString(fmt.Sprintf("Failure Tolerance: %d\n", state.FailureTolerance))
|
||||
buffer.WriteString(fmt.Sprintf("Optimistic Failure Tolerance: %d\n", state.OptimisticFailureTolerance))
|
||||
buffer.WriteString(fmt.Sprintf("Leader: %s\n", state.Leader))
|
||||
buffer.WriteString("Voters:\n")
|
||||
outputStringSlice(&buffer, " ", state.Voters)
|
||||
|
||||
if len(state.NonVoters) > 0 {
|
||||
buffer.WriteString("Non Voters:\n")
|
||||
outputStringSlice(&buffer, " ", state.NonVoters)
|
||||
}
|
||||
|
||||
buffer.WriteString("Servers:\n")
|
||||
var outputs []mapOutput
|
||||
for id, srv := range state.Servers {
|
||||
outputs = append(outputs, mapOutput{key: id, value: formatServer(srv)})
|
||||
}
|
||||
|
||||
sort.Slice(outputs, func(i, j int) bool {
|
||||
return outputs[i].key < outputs[j].key
|
||||
})
|
||||
|
||||
for _, output := range outputs {
|
||||
buffer.WriteString(output.value)
|
||||
}
|
||||
|
||||
ui.Output(buffer.String())
|
||||
}
|
||||
|
||||
// An output formatter for table output of an object
|
||||
type TableFormatter struct{}
|
||||
|
||||
|
|
|
@ -0,0 +1,91 @@
|
|||
package command
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/mitchellh/cli"
|
||||
"github.com/posener/complete"
|
||||
)
|
||||
|
||||
var _ cli.Command = (*OperatorRaftAutopilotGetConfigCommand)(nil)
|
||||
var _ cli.CommandAutocomplete = (*OperatorRaftAutopilotGetConfigCommand)(nil)
|
||||
|
||||
type OperatorRaftAutopilotGetConfigCommand struct {
|
||||
*BaseCommand
|
||||
}
|
||||
|
||||
func (c *OperatorRaftAutopilotGetConfigCommand) Synopsis() string {
|
||||
return "Returns the configuration of the autopilot subsystem under integrated storage"
|
||||
}
|
||||
|
||||
func (c *OperatorRaftAutopilotGetConfigCommand) Help() string {
|
||||
helpText := `
|
||||
Usage: vault operator raft autopilot get-config
|
||||
|
||||
Returns the configuration of the autopilot subsystem under integrated storage.
|
||||
` + c.Flags().Help()
|
||||
|
||||
return strings.TrimSpace(helpText)
|
||||
}
|
||||
|
||||
func (c *OperatorRaftAutopilotGetConfigCommand) Flags() *FlagSets {
|
||||
set := c.flagSet(FlagSetHTTP | FlagSetOutputFormat)
|
||||
|
||||
return set
|
||||
}
|
||||
|
||||
func (c *OperatorRaftAutopilotGetConfigCommand) AutocompleteArgs() complete.Predictor {
|
||||
return complete.PredictAnything
|
||||
}
|
||||
|
||||
func (c *OperatorRaftAutopilotGetConfigCommand) AutocompleteFlags() complete.Flags {
|
||||
return c.Flags().Completions()
|
||||
}
|
||||
|
||||
func (c *OperatorRaftAutopilotGetConfigCommand) Run(args []string) int {
|
||||
f := c.Flags()
|
||||
|
||||
if err := f.Parse(args); err != nil {
|
||||
c.UI.Error(err.Error())
|
||||
return 1
|
||||
}
|
||||
|
||||
args = f.Args()
|
||||
switch len(args) {
|
||||
case 0:
|
||||
default:
|
||||
c.UI.Error(fmt.Sprintf("Incorrect arguments (expected 0, got %d)", len(args)))
|
||||
return 1
|
||||
}
|
||||
|
||||
client, err := c.Client()
|
||||
if err != nil {
|
||||
c.UI.Error(err.Error())
|
||||
return 2
|
||||
}
|
||||
|
||||
config, err := client.Sys().RaftAutopilotConfiguration()
|
||||
if err != nil {
|
||||
c.UI.Error(err.Error())
|
||||
return 2
|
||||
}
|
||||
|
||||
if config == nil {
|
||||
return 0
|
||||
}
|
||||
|
||||
if Format(c.UI) != "table" {
|
||||
return OutputData(c.UI, config)
|
||||
}
|
||||
|
||||
entries := []string{"Key | Value"}
|
||||
entries = append(entries, fmt.Sprintf("%s | %t", "Cleanup Dead Servers", config.CleanupDeadServers))
|
||||
entries = append(entries, fmt.Sprintf("%s | %s", "Last Contact Threshold", config.LastContactThreshold.String()))
|
||||
entries = append(entries, fmt.Sprintf("%s | %s", "Dead Server Last Contact Threshold", config.DeadServerLastContactThreshold.String()))
|
||||
entries = append(entries, fmt.Sprintf("%s | %s", "Server Stabilization Time", config.ServerStabilizationTime.String()))
|
||||
entries = append(entries, fmt.Sprintf("%s | %d", "Min Quorum", config.MinQuorum))
|
||||
entries = append(entries, fmt.Sprintf("%s | %d", "Max Trailing Logs", config.MaxTrailingLogs))
|
||||
|
||||
return OutputData(c.UI, entries)
|
||||
}
|
|
@ -0,0 +1,137 @@
|
|||
package command
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/mitchellh/cli"
|
||||
"github.com/posener/complete"
|
||||
)
|
||||
|
||||
var _ cli.Command = (*OperatorRaftAutopilotSetConfigCommand)(nil)
|
||||
var _ cli.CommandAutocomplete = (*OperatorRaftAutopilotSetConfigCommand)(nil)
|
||||
|
||||
type OperatorRaftAutopilotSetConfigCommand struct {
|
||||
*BaseCommand
|
||||
flagCleanupDeadServers BoolPtr
|
||||
flagLastContactThreshold time.Duration
|
||||
flagDeadServerLastContactThreshold time.Duration
|
||||
flagMaxTrailingLogs uint64
|
||||
flagMinQuorum uint
|
||||
flagServerStabilizationTime time.Duration
|
||||
}
|
||||
|
||||
func (c *OperatorRaftAutopilotSetConfigCommand) Synopsis() string {
|
||||
return "Modify the configuration of the autopilot subsystem under integrated storage"
|
||||
}
|
||||
|
||||
func (c *OperatorRaftAutopilotSetConfigCommand) Help() string {
|
||||
helpText := `
|
||||
Usage: vault operator raft autopilot set-config [options]
|
||||
|
||||
Modify the configuration of the autopilot subsystem under integrated storage.
|
||||
` + c.Flags().Help()
|
||||
|
||||
return strings.TrimSpace(helpText)
|
||||
}
|
||||
|
||||
func (c *OperatorRaftAutopilotSetConfigCommand) Flags() *FlagSets {
|
||||
set := c.flagSet(FlagSetHTTP | FlagSetOutputFormat)
|
||||
|
||||
f := set.NewFlagSet("Common Options")
|
||||
|
||||
f.BoolPtrVar(&BoolPtrVar{
|
||||
Name: "cleanup-dead-servers",
|
||||
Target: &c.flagCleanupDeadServers,
|
||||
})
|
||||
|
||||
f.DurationVar(&DurationVar{
|
||||
Name: "last-contact-threshold",
|
||||
Target: &c.flagLastContactThreshold,
|
||||
})
|
||||
|
||||
f.DurationVar(&DurationVar{
|
||||
Name: "dead-server-last-contact-threshold",
|
||||
Target: &c.flagDeadServerLastContactThreshold,
|
||||
})
|
||||
|
||||
f.Uint64Var(&Uint64Var{
|
||||
Name: "max-trailing-logs",
|
||||
Target: &c.flagMaxTrailingLogs,
|
||||
})
|
||||
|
||||
f.UintVar(&UintVar{
|
||||
Name: "min-quorum",
|
||||
Target: &c.flagMinQuorum,
|
||||
})
|
||||
|
||||
f.DurationVar(&DurationVar{
|
||||
Name: "server-stabilization-time",
|
||||
Target: &c.flagServerStabilizationTime,
|
||||
})
|
||||
|
||||
return set
|
||||
}
|
||||
|
||||
func (c *OperatorRaftAutopilotSetConfigCommand) AutocompleteArgs() complete.Predictor {
|
||||
return complete.PredictAnything
|
||||
}
|
||||
|
||||
func (c *OperatorRaftAutopilotSetConfigCommand) AutocompleteFlags() complete.Flags {
|
||||
return c.Flags().Completions()
|
||||
}
|
||||
|
||||
func (c *OperatorRaftAutopilotSetConfigCommand) Run(args []string) int {
|
||||
f := c.Flags()
|
||||
|
||||
if err := f.Parse(args); err != nil {
|
||||
c.UI.Error(err.Error())
|
||||
return 1
|
||||
}
|
||||
|
||||
args = f.Args()
|
||||
switch len(args) {
|
||||
case 0:
|
||||
default:
|
||||
c.UI.Error(fmt.Sprintf("Incorrect arguments (expected 0, got %d)", len(args)))
|
||||
return 1
|
||||
}
|
||||
|
||||
client, err := c.Client()
|
||||
if err != nil {
|
||||
c.UI.Error(err.Error())
|
||||
return 2
|
||||
}
|
||||
|
||||
data := make(map[string]interface{})
|
||||
if c.flagCleanupDeadServers.IsSet() {
|
||||
data["cleanup_dead_servers"] = c.flagCleanupDeadServers.Get()
|
||||
}
|
||||
if c.flagMaxTrailingLogs > 0 {
|
||||
data["max_trailing_logs"] = c.flagMaxTrailingLogs
|
||||
}
|
||||
if c.flagMinQuorum > 0 {
|
||||
data["min_quorum"] = c.flagMinQuorum
|
||||
}
|
||||
if c.flagLastContactThreshold > 0 {
|
||||
data["last_contact_threshold"] = c.flagLastContactThreshold.String()
|
||||
}
|
||||
if c.flagDeadServerLastContactThreshold > 0 {
|
||||
data["dead_server_last_contact_threshold"] = c.flagDeadServerLastContactThreshold.String()
|
||||
}
|
||||
if c.flagServerStabilizationTime > 0 {
|
||||
data["server_stabilization_time"] = c.flagServerStabilizationTime.String()
|
||||
}
|
||||
|
||||
secret, err := client.Logical().Write("sys/storage/raft/autopilot/configuration", data)
|
||||
if err != nil {
|
||||
c.UI.Error(err.Error())
|
||||
return 2
|
||||
}
|
||||
if secret == nil {
|
||||
return 0
|
||||
}
|
||||
|
||||
return OutputSecret(c.UI, secret)
|
||||
}
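For programmatic use, the same write that this command performs can be issued through the Go client's logical API. This is a sketch: the field values are placeholders, it assumes an *api.Client configured as in the earlier example, and only the keys actually supplied are persisted (they are merged with defaults at runtime, per the commit message).

// Assumes an *api.Client named client, configured as in the earlier example.
_, err := client.Logical().Write("sys/storage/raft/autopilot/configuration", map[string]interface{}{
    "cleanup_dead_servers":               true,
    "dead_server_last_contact_threshold": "10m",
    "min_quorum":                         3,
})
if err != nil {
    log.Fatal(err)
}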
|
|
@ -0,0 +1,92 @@
|
|||
package command
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/mitchellh/cli"
|
||||
"github.com/posener/complete"
|
||||
)
|
||||
|
||||
var _ cli.Command = (*OperatorRaftAutopilotStateCommand)(nil)
|
||||
var _ cli.CommandAutocomplete = (*OperatorRaftAutopilotStateCommand)(nil)
|
||||
|
||||
type OperatorRaftAutopilotStateCommand struct {
|
||||
*BaseCommand
|
||||
}
|
||||
|
||||
func (c *OperatorRaftAutopilotStateCommand) Synopsis() string {
|
||||
return "Displays the state of the raft cluster under integrated storage as seen by autopilot"
|
||||
}
|
||||
|
||||
func (c *OperatorRaftAutopilotStateCommand) Help() string {
|
||||
helpText := `
|
||||
Usage: vault operator raft autopilot state
|
||||
|
||||
Displays the state of the raft cluster under integrated storage as seen by autopilot.
|
||||
` + c.Flags().Help()
|
||||
|
||||
return strings.TrimSpace(helpText)
|
||||
}
|
||||
|
||||
func (c *OperatorRaftAutopilotStateCommand) Flags() *FlagSets {
|
||||
set := c.flagSet(FlagSetHTTP | FlagSetOutputFormat)
|
||||
|
||||
// The output of the state endpoint contains nested values and is not fit for
|
||||
// the default "table" display format. Override the default display format to
|
||||
// "pretty", both in the flag and in the UI.
|
||||
set.mainSet.VisitAll(func(fl *flag.Flag) {
|
||||
if fl.Name == "format" {
|
||||
fl.DefValue = "pretty"
|
||||
}
|
||||
})
|
||||
ui, ok := c.UI.(*VaultUI)
|
||||
if ok && ui.format == "table" {
|
||||
ui.format = "pretty"
|
||||
}
|
||||
return set
|
||||
}
|
||||
|
||||
func (c *OperatorRaftAutopilotStateCommand) AutocompleteArgs() complete.Predictor {
|
||||
return complete.PredictAnything
|
||||
}
|
||||
|
||||
func (c *OperatorRaftAutopilotStateCommand) AutocompleteFlags() complete.Flags {
|
||||
return c.Flags().Completions()
|
||||
}
|
||||
|
||||
func (c *OperatorRaftAutopilotStateCommand) Run(args []string) int {
|
||||
f := c.Flags()
|
||||
|
||||
if err := f.Parse(args); err != nil {
|
||||
c.UI.Error(err.Error())
|
||||
return 1
|
||||
}
|
||||
|
||||
args = f.Args()
|
||||
switch len(args) {
|
||||
case 0:
|
||||
default:
|
||||
c.UI.Error(fmt.Sprintf("Incorrect arguments (expected 0, got %d)", len(args)))
|
||||
return 1
|
||||
}
|
||||
|
||||
client, err := c.Client()
|
||||
if err != nil {
|
||||
c.UI.Error(err.Error())
|
||||
return 2
|
||||
}
|
||||
|
||||
state, err := client.Sys().RaftAutopilotState()
|
||||
if err != nil {
|
||||
c.UI.Error(fmt.Sprintf("Error checking autopilot state: %s", err))
|
||||
return 2
|
||||
}
|
||||
|
||||
if state == nil {
|
||||
return 0
|
||||
}
|
||||
|
||||
return OutputData(c.UI, state)
|
||||
}
|
|
@ -895,3 +895,36 @@ func testLoadConfigFileLeaseMetrics(t *testing.T) {
|
|||
t.Fatal(diff)
|
||||
}
|
||||
}
|
||||
|
||||
func testConfigRaftAutopilot(t *testing.T) {
|
||||
config, err := LoadConfigFile("./test-fixtures/raft_autopilot.hcl")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
autopilotConfig := `[{"cleanup_dead_servers":true,"last_contact_threshold":"500ms","max_trailing_logs":250,"min_quorum":3,"server_stabilization_time":"10s"}]`
|
||||
expected := &Config{
|
||||
SharedConfig: &configutil.SharedConfig{
|
||||
Listeners: []*configutil.Listener{
|
||||
{
|
||||
Type: "tcp",
|
||||
Address: "127.0.0.1:8200",
|
||||
},
|
||||
},
|
||||
DisableMlock: true,
|
||||
},
|
||||
|
||||
Storage: &Storage{
|
||||
Type: "raft",
|
||||
Config: map[string]string{
|
||||
"path": "/storage/path/raft",
|
||||
"node_id": "raft1",
|
||||
"autopilot": autopilotConfig,
|
||||
},
|
||||
},
|
||||
}
|
||||
config.Listeners[0].RawConfig = nil
|
||||
if diff := deep.Equal(config, expected); diff != nil {
|
||||
t.Fatal(diff)
|
||||
}
|
||||
}
|
||||
|
|
go.mod | 4
|
@ -74,7 +74,8 @@ require (
|
|||
github.com/hashicorp/golang-lru v0.5.4
|
||||
github.com/hashicorp/hcl v1.0.1-vault
|
||||
github.com/hashicorp/nomad/api v0.0.0-20191220223628-edc62acd919d
|
||||
github.com/hashicorp/raft v1.1.3-0.20201002073007-f367681f9c48
|
||||
github.com/hashicorp/raft v1.2.0
|
||||
github.com/hashicorp/raft-autopilot v0.1.2
|
||||
github.com/hashicorp/raft-snapshot v1.0.3
|
||||
github.com/hashicorp/serf v0.9.5 // indirect
|
||||
github.com/hashicorp/vault-plugin-auth-alicloud v0.7.0
|
||||
|
@ -155,6 +156,7 @@ require (
|
|||
go.etcd.io/etcd v0.5.0-alpha.5.0.20200425165423-262c93980547
|
||||
go.mongodb.org/mongo-driver v1.4.6
|
||||
go.uber.org/atomic v1.6.0
|
||||
go.uber.org/zap v1.14.1 // indirect
|
||||
golang.org/x/crypto v0.0.0-20210220033148-5ea612d1eb83
|
||||
golang.org/x/net v0.0.0-20201110031124-69a78807bb2b
|
||||
golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d
|
||||
|
|
go.sum | 6
|
@ -652,6 +652,10 @@ github.com/hashicorp/raft v1.0.1/go.mod h1:DVSAWItjLjTOkVbSpWQ0j0kUADIvDaCtBxIcb
|
|||
github.com/hashicorp/raft v1.1.2-0.20191002163536-9c6bd3e3eb17/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8=
|
||||
github.com/hashicorp/raft v1.1.3-0.20201002073007-f367681f9c48 h1:TpaG+HAdfQyreWNaxIlMU6myVKo2ciBDFdRyc+Z90OI=
|
||||
github.com/hashicorp/raft v1.1.3-0.20201002073007-f367681f9c48/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8=
|
||||
github.com/hashicorp/raft v1.2.0 h1:mHzHIrF0S91d3A7RPBvuqkgB4d/7oFJZyvf1Q4m7GA0=
|
||||
github.com/hashicorp/raft v1.2.0/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8=
|
||||
github.com/hashicorp/raft-autopilot v0.1.2 h1:yeqdUjWLjVJkBM+mcVxqwxi+w+aHsb9cEON2dz69OCs=
|
||||
github.com/hashicorp/raft-autopilot v0.1.2/go.mod h1:Af4jZBwaNOI+tXfIqIdbcAnh/UyyqIMj/pOISIfhArw=
|
||||
github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea/go.mod h1:pNv7Wc3ycL6F5oOWn+tPGo2gWD4a5X+yp/ntwdKLjRk=
|
||||
github.com/hashicorp/raft-snapshot v1.0.3 h1:lTgBBGMFcuKBTwHqWZ4r0TLzNsqo/OByCga/kM6F0uM=
|
||||
github.com/hashicorp/raft-snapshot v1.0.3/go.mod h1:5sL9eUn72lH5DzsFIJ9jaysITbHksSSszImWSOTC8Ic=
|
||||
|
@ -1214,6 +1218,7 @@ go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE=
|
|||
go.uber.org/atomic v1.5.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ=
|
||||
go.uber.org/atomic v1.6.0 h1:Ezj3JGmsOnG1MoRWQkPBsKLe9DwWD9QeXzTRzzldNVk=
|
||||
go.uber.org/atomic v1.6.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ=
|
||||
go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A=
|
||||
go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0=
|
||||
go.uber.org/multierr v1.3.0/go.mod h1:VgVr7evmIr6uPjLBxg28wmKNXyqE9akIJ5XnfpiKl+4=
|
||||
go.uber.org/multierr v1.5.0 h1:KCa4XfM8CWFCpxXRGok+Q0SS/0XBhMDbHHGABQLvD2A=
|
||||
|
@ -1463,6 +1468,7 @@ golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtn
|
|||
golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||
golang.org/x/tools v0.0.0-20191029041327-9cc4af7d6b2c/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||
golang.org/x/tools v0.0.0-20191029190741-b9c20aec41a5/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||
golang.org/x/tools v0.0.0-20191108193012-7d206e10da11/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||
golang.org/x/tools v0.0.0-20191113191852-77e3bb0ad9e7/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||
golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||
|
|
|
@ -79,11 +79,7 @@ func MakeFileBackend(t testing.T, logger hclog.Logger) *vault.PhysicalBackendBun
|
|||
}
|
||||
}
|
||||
|
||||
func MakeRaftBackend(t testing.T, coreIdx int, logger hclog.Logger) *vault.PhysicalBackendBundle {
|
||||
return MakeRaftBackendWithConf(t, coreIdx, logger, nil)
|
||||
}
|
||||
|
||||
func MakeRaftBackendWithConf(t testing.T, coreIdx int, logger hclog.Logger, extraConf map[string]string) *vault.PhysicalBackendBundle {
|
||||
func MakeRaftBackend(t testing.T, coreIdx int, logger hclog.Logger, extraConf map[string]interface{}) *vault.PhysicalBackendBundle {
|
||||
nodeID := fmt.Sprintf("core-%d", coreIdx)
|
||||
raftDir, err := ioutil.TempDir("", "vault-raft-")
|
||||
if err != nil {
|
||||
|
@ -102,7 +98,10 @@ func MakeRaftBackendWithConf(t testing.T, coreIdx int, logger hclog.Logger, extr
|
|||
"performance_multiplier": "8",
|
||||
}
|
||||
for k, v := range extraConf {
|
||||
conf[k] = v
|
||||
val, ok := v.(string)
|
||||
if ok {
|
||||
conf[k] = val
|
||||
}
|
||||
}
|
||||
|
||||
backend, err := raft.NewRaftBackend(conf, logger.Named("raft"))
|
||||
|
@ -120,11 +119,11 @@ func MakeRaftBackendWithConf(t testing.T, coreIdx int, logger hclog.Logger, extr
|
|||
// RaftHAFactory returns a PhysicalBackendBundle with raft set as the HABackend
|
||||
// and the physical.Backend provided in PhysicalBackendBundler as the storage
|
||||
// backend.
|
||||
func RaftHAFactory(f PhysicalBackendBundler) func(t testing.T, coreIdx int, logger hclog.Logger) *vault.PhysicalBackendBundle {
|
||||
return func(t testing.T, coreIdx int, logger hclog.Logger) *vault.PhysicalBackendBundle {
|
||||
func RaftHAFactory(f PhysicalBackendBundler) func(t testing.T, coreIdx int, logger hclog.Logger, conf map[string]interface{}) *vault.PhysicalBackendBundle {
|
||||
return func(t testing.T, coreIdx int, logger hclog.Logger, conf map[string]interface{}) *vault.PhysicalBackendBundle {
|
||||
// Call the factory func to create the storage backend
|
||||
physFactory := SharedPhysicalFactory(f)
|
||||
bundle := physFactory(t, coreIdx, logger)
|
||||
bundle := physFactory(t, coreIdx, logger, nil)
|
||||
|
||||
// This can happen if a shared physical backend is called on a non-0th core.
|
||||
if bundle == nil {
|
||||
|
@ -137,14 +136,14 @@ func RaftHAFactory(f PhysicalBackendBundler) func(t testing.T, coreIdx int, logg
|
|||
}
|
||||
|
||||
nodeID := fmt.Sprintf("core-%d", coreIdx)
|
||||
conf := map[string]string{
|
||||
backendConf := map[string]string{
|
||||
"path": raftDir,
|
||||
"node_id": nodeID,
|
||||
"performance_multiplier": "8",
|
||||
}
|
||||
|
||||
// Create and set the HA Backend
|
||||
raftBackend, err := raft.NewRaftBackend(conf, logger)
|
||||
raftBackend, err := raft.NewRaftBackend(backendConf, logger)
|
||||
if err != nil {
|
||||
bundle.Cleanup()
|
||||
t.Fatal(err)
|
||||
|
@ -166,8 +165,8 @@ func RaftHAFactory(f PhysicalBackendBundler) func(t testing.T, coreIdx int, logg
|
|||
|
||||
type PhysicalBackendBundler func(t testing.T, logger hclog.Logger) *vault.PhysicalBackendBundle
|
||||
|
||||
func SharedPhysicalFactory(f PhysicalBackendBundler) func(t testing.T, coreIdx int, logger hclog.Logger) *vault.PhysicalBackendBundle {
|
||||
return func(t testing.T, coreIdx int, logger hclog.Logger) *vault.PhysicalBackendBundle {
|
||||
func SharedPhysicalFactory(f PhysicalBackendBundler) func(t testing.T, coreIdx int, logger hclog.Logger, conf map[string]interface{}) *vault.PhysicalBackendBundle {
|
||||
return func(t testing.T, coreIdx int, logger hclog.Logger, conf map[string]interface{}) *vault.PhysicalBackendBundle {
|
||||
if coreIdx == 0 {
|
||||
return f(t, logger)
|
||||
}
|
||||
|
|
|
@ -44,7 +44,7 @@ func MakeReusableStorage(t testing.T, logger hclog.Logger, bundle *vault.Physica
|
|||
IsRaft: false,
|
||||
|
||||
Setup: func(conf *vault.CoreConfig, opts *vault.TestClusterOptions) {
|
||||
opts.PhysicalFactory = func(t testing.T, coreIdx int, logger hclog.Logger) *vault.PhysicalBackendBundle {
|
||||
opts.PhysicalFactory = func(t testing.T, coreIdx int, logger hclog.Logger, conf map[string]interface{}) *vault.PhysicalBackendBundle {
|
||||
if coreIdx == 0 {
|
||||
// We intentionally do not clone the backend's Cleanup func,
|
||||
// because we don't want it to be run until the entire test has
|
||||
|
@ -86,7 +86,7 @@ func MakeReusableRaftStorage(t testing.T, logger hclog.Logger, numCores int, add
|
|||
Setup: func(conf *vault.CoreConfig, opts *vault.TestClusterOptions) {
|
||||
conf.DisablePerformanceStandby = true
|
||||
opts.KeepStandbysSealed = true
|
||||
opts.PhysicalFactory = func(t testing.T, coreIdx int, logger hclog.Logger) *vault.PhysicalBackendBundle {
|
||||
opts.PhysicalFactory = func(t testing.T, coreIdx int, logger hclog.Logger, conf map[string]interface{}) *vault.PhysicalBackendBundle {
|
||||
return makeReusableRaftBackend(t, coreIdx, logger, raftDirs[coreIdx], addressProvider, false)
|
||||
}
|
||||
},
|
||||
|
@ -125,7 +125,7 @@ func MakeReusableRaftHAStorage(t testing.T, logger hclog.Logger, numCores int, b
|
|||
storage := ReusableStorage{
|
||||
Setup: func(conf *vault.CoreConfig, opts *vault.TestClusterOptions) {
|
||||
opts.KeepStandbysSealed = true
|
||||
opts.PhysicalFactory = func(t testing.T, coreIdx int, logger hclog.Logger) *vault.PhysicalBackendBundle {
|
||||
opts.PhysicalFactory = func(t testing.T, coreIdx int, logger hclog.Logger, conf map[string]interface{}) *vault.PhysicalBackendBundle {
|
||||
haBundle := makeReusableRaftBackend(t, coreIdx, logger, raftDirs[coreIdx], nil, true)
|
||||
|
||||
return &vault.PhysicalBackendBundle{
|
||||
|
|
|
@ -38,10 +38,11 @@ const (
|
|||
|
||||
var (
|
||||
// dataBucketName is the value we use for the bucket
|
||||
dataBucketName = []byte("data")
|
||||
configBucketName = []byte("config")
|
||||
latestIndexKey = []byte("latest_indexes")
|
||||
latestConfigKey = []byte("latest_config")
|
||||
dataBucketName = []byte("data")
|
||||
configBucketName = []byte("config")
|
||||
latestIndexKey = []byte("latest_indexes")
|
||||
latestConfigKey = []byte("latest_config")
|
||||
localNodeConfigKey = []byte("local_node_config")
|
||||
)
|
||||
|
||||
// Verify FSM satisfies the correct interfaces
|
||||
|
@ -86,10 +87,13 @@ type FSM struct {
|
|||
restoreCb restoreCallback
|
||||
|
||||
chunker *raftchunking.ChunkingBatchingFSM
|
||||
|
||||
localID string
|
||||
desiredSuffrage string
|
||||
}
|
||||
|
||||
// NewFSM constructs a FSM using the given directory
|
||||
func NewFSM(path string, logger log.Logger) (*FSM, error) {
|
||||
func NewFSM(path string, localID string, logger log.Logger) (*FSM, error) {
|
||||
|
||||
// Initialize the latest term, index, and config values
|
||||
latestTerm := new(uint64)
|
||||
|
@ -106,6 +110,11 @@ func NewFSM(path string, logger log.Logger) (*FSM, error) {
|
|||
latestTerm: latestTerm,
|
||||
latestIndex: latestIndex,
|
||||
latestConfig: latestConfig,
|
||||
// Assume that the default intent is to join as as voter. This will be updated
|
||||
// when this node joins a cluster with a different suffrage, or during cluster
|
||||
// setup if this is already part of a cluster with a desired suffrage.
|
||||
desiredSuffrage: "voter",
|
||||
localID: localID,
|
||||
}
|
||||
|
||||
f.chunker = raftchunking.NewChunkingBatchingFSM(f, &FSMChunkStorage{
|
||||
|
@ -243,6 +252,113 @@ func writeSnapshotMetaToDB(metadata *raft.SnapshotMeta, db *bolt.DB) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func (f *FSM) localNodeConfig() (*LocalNodeConfigValue, error) {
|
||||
var configBytes []byte
|
||||
if err := f.db.View(func(tx *bolt.Tx) error {
|
||||
value := tx.Bucket(configBucketName).Get(localNodeConfigKey)
|
||||
if value != nil {
|
||||
configBytes = make([]byte, len(value))
|
||||
copy(configBytes, value)
|
||||
}
|
||||
return nil
|
||||
}); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if configBytes == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
var lnConfig LocalNodeConfigValue
|
||||
if configBytes != nil {
|
||||
err := proto.Unmarshal(configBytes, &lnConfig)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
f.desiredSuffrage = lnConfig.DesiredSuffrage
|
||||
return &lnConfig, nil
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f *FSM) upgradeLocalNodeConfig() error {
|
||||
// Read the local node config
|
||||
lnConfig, err := f.localNodeConfig()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Entry is already present. Get the suffrage value.
|
||||
if lnConfig != nil {
|
||||
f.desiredSuffrage = lnConfig.DesiredSuffrage
|
||||
return nil
|
||||
}
|
||||
|
||||
//
|
||||
// This is the upgrade case where there is no entry
|
||||
//
|
||||
|
||||
lnConfig = &LocalNodeConfigValue{}
|
||||
|
||||
// Refer to the persisted latest raft config
|
||||
config := f.latestConfig.Load().(*ConfigurationValue)
|
||||
|
||||
// If there is no config, then this is a fresh node coming up. This could end up
|
||||
// being a voter or non-voter. But by default assume that this is a voter. It
|
||||
// will be changed if this node joins the cluster as a non-voter.
|
||||
if config == nil {
|
||||
lnConfig.DesiredSuffrage = f.desiredSuffrage
|
||||
return f.persistDesiredSuffrage(lnConfig)
|
||||
}
|
||||
|
||||
// Get the last known suffrage of the node and assume that it is the desired
|
||||
// suffrage. There is no better alternative here.
|
||||
for _, srv := range config.Servers {
|
||||
if srv.Id == f.localID {
|
||||
switch srv.Suffrage {
|
||||
case int32(raft.Nonvoter):
|
||||
lnConfig.DesiredSuffrage = "non-voter"
|
||||
default:
|
||||
lnConfig.DesiredSuffrage = "voter"
|
||||
}
|
||||
// Bring the intent to the fsm instance.
|
||||
f.desiredSuffrage = lnConfig.DesiredSuffrage
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return f.persistDesiredSuffrage(lnConfig)
|
||||
}
|
||||
|
||||
// recordSuffrage is called when a node successfully joins the cluster. This
|
||||
// intent should land in the stored configuration. If the config isn't available
|
||||
// yet, we still go ahead and store the intent in the fsm. During the next
|
||||
// update to the configuration, this intent will be persisted.
|
||||
func (f *FSM) recordSuffrage(desiredSuffrage string) error {
|
||||
f.l.Lock()
|
||||
defer f.l.Unlock()
|
||||
|
||||
if err := f.persistDesiredSuffrage(&LocalNodeConfigValue{
|
||||
DesiredSuffrage: desiredSuffrage,
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
f.desiredSuffrage = desiredSuffrage
|
||||
return nil
|
||||
}
|
||||
|
||||
func (f *FSM) persistDesiredSuffrage(lnconfig *LocalNodeConfigValue) error {
|
||||
dsBytes, err := proto.Marshal(lnconfig)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return f.db.Update(func(tx *bolt.Tx) error {
|
||||
return tx.Bucket(configBucketName).Put(localNodeConfigKey, dsBytes)
|
||||
})
|
||||
}
|
||||
|
||||
func (f *FSM) witnessSnapshot(metadata *raft.SnapshotMeta) error {
|
||||
f.l.RLock()
|
||||
defer f.l.RUnlock()
|
||||
|
@ -645,6 +761,12 @@ func (f *FSM) Restore(r io.ReadCloser) error {
|
|||
f.l.Lock()
|
||||
defer f.l.Unlock()
|
||||
|
||||
// Cache the local node config before closing the db file
|
||||
lnConfig, err := f.localNodeConfig()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Close the db file
|
||||
if err := f.db.Close(); err != nil {
|
||||
f.logger.Error("failed to close database file", "error", err)
|
||||
|
@ -671,6 +793,16 @@ func (f *FSM) Restore(r io.ReadCloser) error {
|
|||
retErr = multierror.Append(retErr, errwrap.Wrapf("failed to open new bolt file: {{err}}", err))
|
||||
}
|
||||
|
||||
// Handle local node config restore. lnConfig should not be nil here, but
|
||||
// adding the nil check anyways for safety.
|
||||
if lnConfig != nil {
|
||||
// Persist the local node config on the restored fsm.
|
||||
if err := f.persistDesiredSuffrage(lnConfig); err != nil {
|
||||
f.logger.Error("failed to persist local node config from before the restore", "error", err)
|
||||
retErr = multierror.Append(retErr, errwrap.Wrapf("failed to persist local node config from before the restore: {{err}}", err))
|
||||
}
|
||||
}
|
||||
|
||||
return retErr.ErrorOrNil()
|
||||
}
|
||||
|
||||
|
|
|
@ -28,7 +28,7 @@ func getFSM(t testing.TB) (*FSM, string) {
|
|||
Level: hclog.Trace,
|
||||
})
|
||||
|
||||
fsm, err := NewFSM(raftDir, logger)
|
||||
fsm, err := NewFSM(raftDir, "", logger)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
|
|
@ -21,6 +21,7 @@ import (
|
|||
"github.com/hashicorp/go-raftchunking"
|
||||
"github.com/hashicorp/go-uuid"
|
||||
"github.com/hashicorp/raft"
|
||||
autopilot "github.com/hashicorp/raft-autopilot"
|
||||
snapshot "github.com/hashicorp/raft-snapshot"
|
||||
raftboltdb "github.com/hashicorp/vault/physical/raft/logstore"
|
||||
"github.com/hashicorp/vault/sdk/helper/consts"
|
||||
|
@ -120,6 +121,30 @@ type RaftBackend struct {
|
|||
// It is suggested to use a value of 2x the Raft chunking size for optimal
|
||||
// performance.
|
||||
maxEntrySize uint64
|
||||
|
||||
// autopilot is the instance of raft-autopilot library implementation of the
|
||||
// autopilot features. This will be instantiated in both leader and followers.
|
||||
// However, only active node will have a "running" autopilot.
|
||||
autopilot *autopilot.Autopilot
|
||||
|
||||
// autopilotConfig represents the configuration required to instantiate autopilot.
|
||||
autopilotConfig *AutopilotConfig
|
||||
|
||||
// followerStates represents the information about all the peers of the raft
|
||||
// leader. This is used to track some state of the peers and as well as used
|
||||
// to see if the peers are "alive" using the heartbeat received from them.
|
||||
followerStates *FollowerStates
|
||||
|
||||
// followerHeartbeatTicker is used to compute dead servers using follower
|
||||
// state heartbeats.
|
||||
followerHeartbeatTicker *time.Ticker
|
||||
|
||||
// disableAutopilot if set will not put autopilot implementation to use. The
|
||||
// fallback will be to interact with the raft instance directly. This can only
|
||||
// be set during startup via the environment variable
|
||||
// VAULT_RAFT_AUTOPILOT_DISABLE during startup and can't be updated once the
|
||||
// node is up and running.
|
||||
disableAutopilot bool
|
||||
}
|
||||
|
||||
// LeaderJoinInfo contains information required by a node to join itself as a
|
||||
|
@ -247,7 +272,6 @@ func EnsurePath(path string, dir bool) error {
|
|||
|
||||
// NewRaftBackend constructs a RaftBackend using the given directory
|
||||
func NewRaftBackend(conf map[string]string, logger log.Logger) (physical.Backend, error) {
|
||||
|
||||
path := os.Getenv(EnvVaultRaftPath)
|
||||
if path == "" {
|
||||
pathFromConfig, ok := conf["path"]
|
||||
|
@ -257,8 +281,50 @@ func NewRaftBackend(conf map[string]string, logger log.Logger) (physical.Backend
|
|||
path = pathFromConfig
|
||||
}
|
||||
|
||||
var localID string
|
||||
{
|
||||
// Determine the local node ID from the environment.
|
||||
if raftNodeID := os.Getenv(EnvVaultRaftNodeID); raftNodeID != "" {
|
||||
localID = raftNodeID
|
||||
}
|
||||
|
||||
// If not set in the environment check the configuration file.
|
||||
if len(localID) == 0 {
|
||||
localID = conf["node_id"]
|
||||
}
|
||||
|
||||
// If not set in the config check the "node-id" file.
|
||||
if len(localID) == 0 {
|
||||
localIDRaw, err := ioutil.ReadFile(filepath.Join(path, "node-id"))
|
||||
switch {
|
||||
case err == nil:
|
||||
if len(localIDRaw) > 0 {
|
||||
localID = string(localIDRaw)
|
||||
}
|
||||
case os.IsNotExist(err):
|
||||
default:
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
// If all of the above fails generate a UUID and persist it to the
|
||||
// "node-id" file.
|
||||
if len(localID) == 0 {
|
||||
id, err := uuid.GenerateUUID()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := ioutil.WriteFile(filepath.Join(path, "node-id"), []byte(id), 0600); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
localID = id
|
||||
}
|
||||
}
|
||||
|
||||
// Create the FSM.
|
||||
fsm, err := NewFSM(path, logger.Named("fsm"))
|
||||
fsm, err := NewFSM(path, localID, logger.Named("fsm"))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create fsm: %v", err)
|
||||
}
|
||||
|
@ -322,48 +388,6 @@ func NewRaftBackend(conf map[string]string, logger log.Logger) (physical.Backend
|
|||
snap = newSnapshotStoreDelay(snap, delay)
|
||||
}
|
||||
|
||||
var localID string
|
||||
{
|
||||
// Determine the local node ID from the environment.
|
||||
if raftNodeID := os.Getenv(EnvVaultRaftNodeID); raftNodeID != "" {
|
||||
localID = raftNodeID
|
||||
}
|
||||
|
||||
// If not set in the environment check the configuration file.
|
||||
if len(localID) == 0 {
|
||||
localID = conf["node_id"]
|
||||
}
|
||||
|
||||
// If not set in the config check the "node-id" file.
|
||||
if len(localID) == 0 {
|
||||
localIDRaw, err := ioutil.ReadFile(filepath.Join(path, "node-id"))
|
||||
switch {
|
||||
case err == nil:
|
||||
if len(localIDRaw) > 0 {
|
||||
localID = string(localIDRaw)
|
||||
}
|
||||
case os.IsNotExist(err):
|
||||
default:
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
// If all of the above fails generate a UUID and persist it to the
|
||||
// "node-id" file.
|
||||
if len(localID) == 0 {
|
||||
id, err := uuid.GenerateUUID()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := ioutil.WriteFile(filepath.Join(path, "node-id"), []byte(id), 0600); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
localID = id
|
||||
}
|
||||
}
|
||||
|
||||
maxEntrySize := defaultMaxEntrySize
|
||||
if maxEntrySizeCfg := conf["max_entry_size"]; len(maxEntrySizeCfg) != 0 {
|
||||
i, err := strconv.Atoi(maxEntrySizeCfg)
|
||||
|
@ -375,17 +399,18 @@ func NewRaftBackend(conf map[string]string, logger log.Logger) (physical.Backend
|
|||
}
|
||||
|
||||
return &RaftBackend{
|
||||
logger: logger,
|
||||
fsm: fsm,
|
||||
raftInitCh: make(chan struct{}),
|
||||
conf: conf,
|
||||
logStore: log,
|
||||
stableStore: stable,
|
||||
snapStore: snap,
|
||||
dataDir: path,
|
||||
localID: localID,
|
||||
permitPool: physical.NewPermitPool(physical.DefaultParallelOperations),
|
||||
maxEntrySize: maxEntrySize,
|
||||
logger: logger,
|
||||
fsm: fsm,
|
||||
raftInitCh: make(chan struct{}),
|
||||
conf: conf,
|
||||
logStore: log,
|
||||
stableStore: stable,
|
||||
snapStore: snap,
|
||||
dataDir: path,
|
||||
localID: localID,
|
||||
permitPool: physical.NewPermitPool(physical.DefaultParallelOperations),
|
||||
maxEntrySize: maxEntrySize,
|
||||
followerHeartbeatTicker: time.NewTicker(time.Second),
|
||||
}, nil
|
||||
}
|
||||
|
||||
|
@ -781,6 +806,11 @@ func (b *RaftBackend) SetupCluster(ctx context.Context, opts SetupOpts) error {
|
|||
b.raft = raftObj
|
||||
b.raftNotifyCh = raftNotifyCh
|
||||
|
||||
if err := b.fsm.upgradeLocalNodeConfig(); err != nil {
|
||||
b.logger.Error("failed to upgrade local node configuration")
|
||||
return err
|
||||
}
|
||||
|
||||
if b.streamLayer != nil {
|
||||
// Add Handler to the cluster.
|
||||
opts.ClusterListener.AddHandler(consts.RaftStorageALPN, b.streamLayer)
|
||||
|
@ -852,19 +882,42 @@ func (b *RaftBackend) AppliedIndex() uint64 {
|
|||
return indexState.Index
|
||||
}
|
||||
|
||||
// Term returns the raft term of this node.
|
||||
func (b *RaftBackend) Term() uint64 {
|
||||
b.l.RLock()
|
||||
defer b.l.RUnlock()
|
||||
|
||||
if b.fsm == nil {
|
||||
return 0
|
||||
}
|
||||
|
||||
// We use the latest index that the FSM has seen here, which may be behind
|
||||
// raft.AppliedIndex() due to the async nature of the raft library.
|
||||
indexState, _ := b.fsm.LatestState()
|
||||
return indexState.Term
|
||||
}
|
||||
|
||||
// RemovePeer removes the given peer ID from the raft cluster. If the node is
|
||||
// ourselves we will give up leadership.
|
||||
func (b *RaftBackend) RemovePeer(ctx context.Context, peerID string) error {
|
||||
b.l.RLock()
|
||||
defer b.l.RUnlock()
|
||||
|
||||
if b.raft == nil {
|
||||
return errors.New("raft storage is not initialized")
|
||||
if b.disableAutopilot {
|
||||
if b.raft == nil {
|
||||
return errors.New("raft storage is not initialized")
|
||||
}
|
||||
b.logger.Trace("removing server from raft", "id", peerID)
|
||||
future := b.raft.RemoveServer(raft.ServerID(peerID), 0, 0)
|
||||
return future.Error()
|
||||
}
|
||||
|
||||
future := b.raft.RemoveServer(raft.ServerID(peerID), 0, 0)
|
||||
if b.autopilot == nil {
|
||||
return errors.New("raft storage autopilot is not initialized")
|
||||
}
|
||||
|
||||
return future.Error()
|
||||
b.logger.Trace("removing server from raft via autopilot", "id", peerID)
|
||||
return b.autopilot.RemoveServer(raft.ServerID(peerID))
|
||||
}
|
||||
|
||||
func (b *RaftBackend) GetConfiguration(ctx context.Context) (*RaftConfigurationResponse, error) {
|
||||
|
@ -905,14 +958,27 @@ func (b *RaftBackend) AddPeer(ctx context.Context, peerID, clusterAddr string) e
|
|||
b.l.RLock()
|
||||
defer b.l.RUnlock()
|
||||
|
||||
if b.raft == nil {
|
||||
return errors.New("raft storage is not initialized")
|
||||
if b.disableAutopilot {
|
||||
if b.raft == nil {
|
||||
return errors.New("raft storage is not initialized")
|
||||
}
|
||||
b.logger.Trace("adding server to raft", "id", peerID)
|
||||
future := b.raft.AddVoter(raft.ServerID(peerID), raft.ServerAddress(clusterAddr), 0, 0)
|
||||
return future.Error()
|
||||
}
|
||||
|
||||
b.logger.Debug("adding raft peer", "node_id", peerID, "cluster_addr", clusterAddr)
|
||||
if b.autopilot == nil {
|
||||
return errors.New("raft storage autopilot is not initialized")
|
||||
}
|
||||
|
||||
future := b.raft.AddVoter(raft.ServerID(peerID), raft.ServerAddress(clusterAddr), 0, 0)
|
||||
return future.Error()
|
||||
b.logger.Trace("adding server to raft via autopilot", "id", peerID)
|
||||
return b.autopilot.AddServer(&autopilot.Server{
|
||||
ID: raft.ServerID(peerID),
|
||||
Name: peerID,
|
||||
Address: raft.ServerAddress(clusterAddr),
|
||||
RaftVersion: raft.ProtocolVersionMax,
|
||||
NodeType: autopilot.NodeVoter,
|
||||
})
|
||||
}
|
||||
|
||||
// Peers returns all the servers present in the raft cluster
|
||||
|
@ -921,7 +987,7 @@ func (b *RaftBackend) Peers(ctx context.Context) ([]Peer, error) {
|
|||
defer b.l.RUnlock()
|
||||
|
||||
if b.raft == nil {
|
||||
return nil, errors.New("raft storage backend is not initialized")
|
||||
return nil, errors.New("raft storage is not initialized")
|
||||
}
|
||||
|
||||
future := b.raft.GetConfiguration()
|
||||
|
@ -957,7 +1023,7 @@ func (b *RaftBackend) Snapshot(out io.Writer, access *seal.Access) error {
|
|||
defer b.l.RUnlock()
|
||||
|
||||
if b.raft == nil {
|
||||
return errors.New("raft storage backend is sealed")
|
||||
return errors.New("raft storage is sealed")
|
||||
}
|
||||
|
||||
// If we have access to the seal create a sealer object
|
||||
|
@ -982,7 +1048,7 @@ func (b *RaftBackend) WriteSnapshotToTemp(in io.ReadCloser, access *seal.Access)
|
|||
|
||||
var metadata raft.SnapshotMeta
|
||||
if b.raft == nil {
|
||||
return nil, nil, metadata, errors.New("raft storage backend is sealed")
|
||||
return nil, nil, metadata, errors.New("raft storage is sealed")
|
||||
}
|
||||
|
||||
// If we have access to the seal create a sealer object
|
||||
|
@ -1150,7 +1216,7 @@ func (b *RaftBackend) Transaction(ctx context.Context, txns []*physical.TxnEntry
|
|||
// persisted to the local FSM. Caller should hold the backend's read lock.
|
||||
func (b *RaftBackend) applyLog(ctx context.Context, command *LogData) error {
|
||||
if b.raft == nil {
|
||||
return errors.New("raft storage backend is not initialized")
|
||||
return errors.New("raft storage is not initialized")
|
||||
}
|
||||
|
||||
commandBytes, err := proto.Marshal(command)
|
||||
|
@ -1222,6 +1288,35 @@ func (b *RaftBackend) LockWith(key, value string) (physical.Lock, error) {
|
|||
}, nil
|
||||
}
|
||||
|
||||
// SetDesiredSuffrage sets a field in the fsm indicating the suffrage intent for
|
||||
// this node.
|
||||
func (b *RaftBackend) SetDesiredSuffrage(nonVoter bool) error {
|
||||
b.l.Lock()
|
||||
defer b.l.Unlock()
|
||||
|
||||
var desiredSuffrage string
|
||||
switch nonVoter {
|
||||
case true:
|
||||
desiredSuffrage = "non-voter"
|
||||
default:
|
||||
desiredSuffrage = "voter"
|
||||
}
|
||||
|
||||
err := b.fsm.recordSuffrage(desiredSuffrage)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (b *RaftBackend) DesiredSuffrage() string {
|
||||
b.l.RLock()
|
||||
desiredSuffrage := b.fsm.desiredSuffrage
|
||||
b.l.RUnlock()
|
||||
return desiredSuffrage
|
||||
}
|
||||
|
||||
// RaftLock implements the physical Lock interface and enables HA for this
|
||||
// backend. The Lock uses the raftNotifyCh for receiving leadership edge
|
||||
// triggers. Vault's active duty matches raft's leadership.
|
||||
|
@ -1327,8 +1422,6 @@ func (l *RaftLock) Lock(stopCh <-chan struct{}) (<-chan struct{}, error) {
|
|||
return nil, nil
|
||||
}
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// Unlock gives up leadership.
|
||||
|
|
|
@ -0,0 +1,711 @@
|
|||
package raft
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"strconv"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/vault/sdk/helper/parseutil"
|
||||
"github.com/hashicorp/vault/sdk/helper/strutil"
|
||||
"go.uber.org/atomic"
|
||||
|
||||
metrics "github.com/armon/go-metrics"
|
||||
"github.com/hashicorp/raft"
|
||||
autopilot "github.com/hashicorp/raft-autopilot"
|
||||
"github.com/mitchellh/mapstructure"
|
||||
)
|
||||
|
||||
type CleanupDeadServersValue int
|
||||
|
||||
const (
|
||||
CleanupDeadServersUnset CleanupDeadServersValue = 0
|
||||
CleanupDeadServersTrue CleanupDeadServersValue = 1
|
||||
CleanupDeadServersFalse CleanupDeadServersValue = 2
|
||||
)
|
||||
|
||||
func (c CleanupDeadServersValue) Value() bool {
|
||||
switch c {
|
||||
case CleanupDeadServersTrue:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// AutopilotConfig is used for querying/setting the Autopilot configuration.
|
||||
type AutopilotConfig struct {
|
||||
// CleanupDeadServers controls whether to remove dead servers from the Raft
|
||||
// peer list periodically or when a new server joins.
|
||||
CleanupDeadServers bool `mapstructure:"cleanup_dead_servers"`
|
||||
|
||||
// CleanupDeadServersValue is used to shadow the CleanupDeadServers field in
|
||||
// storage. Having it as an int helps in knowing whether the value was set
// explicitly via the API.
|
||||
CleanupDeadServersValue CleanupDeadServersValue `mapstructure:"cleanup_dead_servers_value"`
|
||||
|
||||
// LastContactThreshold is the limit on the amount of time a server can go
|
||||
// without leader contact before being considered unhealthy.
|
||||
LastContactThreshold time.Duration `mapstructure:"-"`
|
||||
|
||||
// DeadServerLastContactThreshold is the limit on the amount of time a server
|
||||
// can go without leader contact before being considered failed. This takes
|
||||
// effect only when CleanupDeadServers is set.
|
||||
DeadServerLastContactThreshold time.Duration `mapstructure:"-"`
|
||||
|
||||
// MaxTrailingLogs is the amount of entries in the Raft Log that a server can
|
||||
// be behind before being considered unhealthy.
|
||||
MaxTrailingLogs uint64 `mapstructure:"max_trailing_logs"`
|
||||
|
||||
// MinQuorum sets the minimum number of servers allowed in a cluster before
|
||||
// autopilot can prune dead servers.
|
||||
MinQuorum uint `mapstructure:"min_quorum"`
|
||||
|
||||
// ServerStabilizationTime is the minimum amount of time a server must be in a
|
||||
// stable, healthy state before it can be added to the cluster. Only applicable
|
||||
// with Raft protocol version 3 or higher.
|
||||
ServerStabilizationTime time.Duration `mapstructure:"-"`
|
||||
}
|
||||
|
||||
// Merge combines the supplied config with the receiver. Supplied values take
// priority.
|
||||
func (to *AutopilotConfig) Merge(from *AutopilotConfig) {
|
||||
if from == nil {
|
||||
return
|
||||
}
|
||||
if from.CleanupDeadServersValue != CleanupDeadServersUnset {
|
||||
to.CleanupDeadServers = from.CleanupDeadServersValue.Value()
|
||||
}
|
||||
if from.MinQuorum != 0 {
|
||||
to.MinQuorum = from.MinQuorum
|
||||
}
|
||||
if from.LastContactThreshold != 0 {
|
||||
to.LastContactThreshold = from.LastContactThreshold
|
||||
}
|
||||
if from.DeadServerLastContactThreshold != 0 {
|
||||
to.DeadServerLastContactThreshold = from.DeadServerLastContactThreshold
|
||||
}
|
||||
if from.MaxTrailingLogs != 0 {
|
||||
to.MaxTrailingLogs = from.MaxTrailingLogs
|
||||
}
|
||||
if from.ServerStabilizationTime != 0 {
|
||||
to.ServerStabilizationTime = from.ServerStabilizationTime
|
||||
}
|
||||
}
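The tri-state CleanupDeadServersValue exists because a plain bool cannot distinguish "explicitly set to false" from "never set": Merge only overrides the receiver when the tri-state is something other than Unset. A minimal in-package sketch of that behavior (the helper name is hypothetical):

// Sketch (hypothetical helper, same package): an explicitly-set "false"
// survives the merge, while an unset tri-state leaves the defaults untouched.
func exampleMergeCleanup() bool {
    def := &AutopilotConfig{CleanupDeadServers: true, MaxTrailingLogs: 1000}

    // The operator explicitly disabled cleanup over the API: the tri-state is
    // set, so Merge overrides the receiver even though the bool zero value is false.
    def.Merge(&AutopilotConfig{CleanupDeadServersValue: CleanupDeadServersFalse})

    // A config whose tri-state is CleanupDeadServersUnset changes nothing.
    def.Merge(&AutopilotConfig{})

    return def.CleanupDeadServers // false
}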
|
||||
|
||||
// Clone returns a duplicate instance of AutopilotConfig with the exact same values.
|
||||
func (ac *AutopilotConfig) Clone() *AutopilotConfig {
|
||||
if ac == nil {
|
||||
return nil
|
||||
}
|
||||
return &AutopilotConfig{
|
||||
CleanupDeadServers: ac.CleanupDeadServers,
|
||||
LastContactThreshold: ac.LastContactThreshold,
|
||||
DeadServerLastContactThreshold: ac.DeadServerLastContactThreshold,
|
||||
MaxTrailingLogs: ac.MaxTrailingLogs,
|
||||
MinQuorum: ac.MinQuorum,
|
||||
ServerStabilizationTime: ac.ServerStabilizationTime,
|
||||
}
|
||||
}
|
||||
|
||||
// MarshalJSON makes the autopilot config fields JSON compatible
|
||||
func (ac *AutopilotConfig) MarshalJSON() ([]byte, error) {
|
||||
return json.Marshal(map[string]interface{}{
|
||||
"cleanup_dead_servers": ac.CleanupDeadServers,
|
||||
"cleanup_dead_servers_value": ac.CleanupDeadServersValue,
|
||||
"last_contact_threshold": ac.LastContactThreshold.String(),
|
||||
"dead_server_last_contact_threshold": ac.DeadServerLastContactThreshold.String(),
|
||||
"max_trailing_logs": ac.MaxTrailingLogs,
|
||||
"min_quorum": ac.MinQuorum,
|
||||
"server_stabilization_time": ac.ServerStabilizationTime.String(),
|
||||
})
|
||||
}
|
||||
|
||||
// UnmarshalJSON parses the autopilot config JSON blob
|
||||
func (ac *AutopilotConfig) UnmarshalJSON(b []byte) error {
|
||||
var data interface{}
|
||||
err := json.Unmarshal(b, &data)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
conf := data.(map[string]interface{})
|
||||
if err = mapstructure.WeakDecode(conf, ac); err != nil {
|
||||
return err
|
||||
}
|
||||
if ac.LastContactThreshold, err = parseutil.ParseDurationSecond(conf["last_contact_threshold"]); err != nil {
|
||||
return err
|
||||
}
|
||||
if ac.DeadServerLastContactThreshold, err = parseutil.ParseDurationSecond(conf["dead_server_last_contact_threshold"]); err != nil {
|
||||
return err
|
||||
}
|
||||
if ac.ServerStabilizationTime, err = parseutil.ParseDurationSecond(conf["server_stabilization_time"]); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
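The custom MarshalJSON/UnmarshalJSON pair above stores the three duration fields as human-readable strings and parses them back with parseutil. A minimal round-trip sketch, assuming the package is imported as a library the way the tests elsewhere in this diff do:

package main

import (
    "encoding/json"
    "fmt"
    "time"

    raft "github.com/hashicorp/vault/physical/raft"
)

func main() {
    in := &raft.AutopilotConfig{
        CleanupDeadServers:      true,
        LastContactThreshold:    10 * time.Second,
        ServerStabilizationTime: 10 * time.Second,
        MaxTrailingLogs:         1000,
        MinQuorum:               3,
    }

    // Duration fields are emitted as strings such as "10s".
    blob, err := json.Marshal(in)
    if err != nil {
        panic(err)
    }
    fmt.Println(string(blob))

    // UnmarshalJSON weak-decodes the scalar fields and parses the duration
    // strings back with parseutil.
    out := new(raft.AutopilotConfig)
    if err := json.Unmarshal(blob, out); err != nil {
        panic(err)
    }
    fmt.Println(out.LastContactThreshold) // 10s
}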
|
||||
|
||||
// FollowerState represents the information about a peer that the leader tracks.
|
||||
type FollowerState struct {
|
||||
AppliedIndex uint64
|
||||
LastHeartbeat time.Time
|
||||
LastTerm uint64
|
||||
IsDead *atomic.Bool
|
||||
DesiredSuffrage string
|
||||
}
|
||||
|
||||
// FollowerStates holds information about all the followers in the raft cluster
|
||||
// tracked by the leader.
|
||||
type FollowerStates struct {
|
||||
l sync.RWMutex
|
||||
followers map[string]*FollowerState
|
||||
}
|
||||
|
||||
// NewFollowerStates creates a new FollowerStates object
|
||||
func NewFollowerStates() *FollowerStates {
|
||||
return &FollowerStates{
|
||||
followers: make(map[string]*FollowerState),
|
||||
}
|
||||
}
|
||||
|
||||
// Update the peer information in the follower states
|
||||
func (s *FollowerStates) Update(nodeID string, appliedIndex uint64, term uint64, desiredSuffrage string) {
|
||||
s.l.Lock()
|
||||
defer s.l.Unlock()
|
||||
|
||||
state, ok := s.followers[nodeID]
|
||||
if !ok {
|
||||
state = &FollowerState{
|
||||
IsDead: atomic.NewBool(false),
|
||||
}
|
||||
s.followers[nodeID] = state
|
||||
}
|
||||
|
||||
state.AppliedIndex = appliedIndex
|
||||
state.LastTerm = term
|
||||
state.DesiredSuffrage = desiredSuffrage
|
||||
state.LastHeartbeat = time.Now()
|
||||
}
|
||||
|
||||
// Clear wipes all the information regarding peers in the follower states.
|
||||
func (s *FollowerStates) Clear() {
|
||||
s.l.Lock()
|
||||
for i := range s.followers {
|
||||
delete(s.followers, i)
|
||||
}
|
||||
s.l.Unlock()
|
||||
}
|
||||
|
||||
// Delete the entry of a peer represented by the nodeID from follower states.
|
||||
func (s *FollowerStates) Delete(nodeID string) {
|
||||
s.l.Lock()
|
||||
delete(s.followers, nodeID)
|
||||
s.l.Unlock()
|
||||
}
|
||||
|
||||
// MinIndex returns the minimum raft index applied in the raft cluster.
|
||||
func (s *FollowerStates) MinIndex() uint64 {
|
||||
var min uint64 = math.MaxUint64
|
||||
minFunc := func(a, b uint64) uint64 {
|
||||
if a > b {
|
||||
return b
|
||||
}
|
||||
return a
|
||||
}
|
||||
|
||||
s.l.RLock()
|
||||
for _, state := range s.followers {
|
||||
min = minFunc(min, state.AppliedIndex)
|
||||
}
|
||||
s.l.RUnlock()
|
||||
|
||||
if min == math.MaxUint64 {
|
||||
return 0
|
||||
}
|
||||
|
||||
return min
|
||||
}
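A minimal usage sketch of FollowerStates as the leader would drive it (node IDs, indexes and terms are hypothetical): Update records a heartbeat along with the applied index, term and desired suffrage, and MinIndex reports the lowest applied index across tracked followers, or 0 when none are tracked.

package main

import (
    "fmt"

    raft "github.com/hashicorp/vault/physical/raft"
)

func main() {
    states := raft.NewFollowerStates()

    // The leader records each follower heartbeat; Update also stamps LastHeartbeat.
    states.Update("node-2", 120, 3, "voter")
    states.Update("node-3", 100, 3, "non-voter")

    // Lowest applied index across the tracked followers.
    fmt.Println(states.MinIndex()) // 100

    // With nothing tracked, MinIndex reports 0.
    states.Clear()
    fmt.Println(states.MinIndex()) // 0
}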
|
||||
|
||||
// Ensure that the Delegate implements the ApplicationIntegration interface
|
||||
var _ autopilot.ApplicationIntegration = (*Delegate)(nil)
|
||||
|
||||
// Delegate is an implementation of the autopilot.ApplicationIntegration
// interface, used by the autopilot library to retrieve information and to have
// application-specific tasks performed.
|
||||
type Delegate struct {
|
||||
*RaftBackend
|
||||
|
||||
// dl is a lock dedicated for guarding delegate's fields
|
||||
dl sync.RWMutex
|
||||
inflightRemovals map[raft.ServerID]bool
|
||||
}
|
||||
|
||||
func newDelegate(b *RaftBackend) *Delegate {
|
||||
return &Delegate{
|
||||
RaftBackend: b,
|
||||
inflightRemovals: make(map[raft.ServerID]bool),
|
||||
}
|
||||
}
|
||||
|
||||
// AutopilotConfig is called by the autopilot library to know the desired
|
||||
// autopilot configuration.
|
||||
func (d *Delegate) AutopilotConfig() *autopilot.Config {
|
||||
d.l.RLock()
|
||||
config := &autopilot.Config{
|
||||
CleanupDeadServers: d.autopilotConfig.CleanupDeadServers,
|
||||
LastContactThreshold: d.autopilotConfig.LastContactThreshold,
|
||||
MaxTrailingLogs: d.autopilotConfig.MaxTrailingLogs,
|
||||
MinQuorum: d.autopilotConfig.MinQuorum,
|
||||
ServerStabilizationTime: d.autopilotConfig.ServerStabilizationTime,
|
||||
Ext: d.autopilotConfigExt(),
|
||||
}
|
||||
d.l.RUnlock()
|
||||
return config
|
||||
}
|
||||
|
||||
// NotifyState is called by the autopilot library whenever there is a state
|
||||
// change. We update a few metrics when this happens.
|
||||
func (d *Delegate) NotifyState(state *autopilot.State) {
|
||||
if d.raft.State() == raft.Leader {
|
||||
metrics.SetGauge([]string{"autopilot", "failure_tolerance"}, float32(state.FailureTolerance))
|
||||
if state.Healthy {
|
||||
metrics.SetGauge([]string{"autopilot", "healthy"}, 1)
|
||||
} else {
|
||||
metrics.SetGauge([]string{"autopilot", "healthy"}, 0)
|
||||
}
|
||||
|
||||
for id, state := range state.Servers {
|
||||
labels := []metrics.Label{
|
||||
{
|
||||
Name: "node_id",
|
||||
Value: string(id),
|
||||
},
|
||||
}
|
||||
if state.Health.Healthy {
|
||||
metrics.SetGaugeWithLabels([]string{"autopilot", "node", "healthy"}, 1, labels)
|
||||
} else {
|
||||
metrics.SetGaugeWithLabels([]string{"autopilot", "node", "healthy"}, 0, labels)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// FetchServerStats is called by the autopilot library to retrieve information
|
||||
// about all the nodes in the raft cluster.
|
||||
func (d *Delegate) FetchServerStats(ctx context.Context, servers map[raft.ServerID]*autopilot.Server) map[raft.ServerID]*autopilot.ServerStats {
|
||||
ret := make(map[raft.ServerID]*autopilot.ServerStats)
|
||||
|
||||
d.l.RLock()
|
||||
followerStates := d.followerStates
|
||||
d.l.RUnlock()
|
||||
|
||||
followerStates.l.RLock()
|
||||
defer followerStates.l.RUnlock()
|
||||
|
||||
now := time.Now()
|
||||
for id, followerState := range followerStates.followers {
|
||||
ret[raft.ServerID(id)] = &autopilot.ServerStats{
|
||||
LastContact: now.Sub(followerState.LastHeartbeat),
|
||||
LastTerm: followerState.LastTerm,
|
||||
LastIndex: followerState.AppliedIndex,
|
||||
}
|
||||
}
|
||||
|
||||
leaderState, _ := d.fsm.LatestState()
|
||||
ret[raft.ServerID(d.localID)] = &autopilot.ServerStats{
|
||||
LastTerm: leaderState.Term,
|
||||
LastIndex: leaderState.Index,
|
||||
}
|
||||
|
||||
return ret
|
||||
}
|
||||
|
||||
// KnownServers is called by the autopilot library to know the status of each
|
||||
// node in the raft cluster. If the application considers certain nodes to have
// left the cluster, this is where we let the autopilot library know.
|
||||
func (d *Delegate) KnownServers() map[raft.ServerID]*autopilot.Server {
|
||||
d.l.RLock()
|
||||
defer d.l.RUnlock()
|
||||
future := d.raft.GetConfiguration()
|
||||
if err := future.Error(); err != nil {
|
||||
d.logger.Error("failed to get raft configuration when computing known servers", "error", err)
|
||||
return nil
|
||||
}
|
||||
|
||||
servers := future.Configuration().Servers
|
||||
serverIDs := make([]string, 0, len(servers))
|
||||
for _, server := range servers {
|
||||
serverIDs = append(serverIDs, string(server.ID))
|
||||
}
|
||||
|
||||
d.followerStates.l.RLock()
|
||||
defer d.followerStates.l.RUnlock()
|
||||
|
||||
ret := make(map[raft.ServerID]*autopilot.Server)
|
||||
for id, state := range d.followerStates.followers {
|
||||
// If the server is not in raft configuration, even if we received a follower
|
||||
// heartbeat, it shouldn't be a known server for autopilot.
|
||||
if !strutil.StrListContains(serverIDs, id) {
|
||||
continue
|
||||
}
|
||||
|
||||
server := &autopilot.Server{
|
||||
ID: raft.ServerID(id),
|
||||
Name: id,
|
||||
RaftVersion: raft.ProtocolVersionMax,
|
||||
Ext: d.autopilotServerExt(state.DesiredSuffrage),
|
||||
}
|
||||
|
||||
switch state.IsDead.Load() {
|
||||
case true:
|
||||
d.logger.Debug("informing autopilot that the node left", "id", id)
|
||||
server.NodeStatus = autopilot.NodeLeft
|
||||
default:
|
||||
server.NodeStatus = autopilot.NodeAlive
|
||||
}
|
||||
|
||||
ret[raft.ServerID(id)] = server
|
||||
}
|
||||
|
||||
// Add the leader
|
||||
ret[raft.ServerID(d.localID)] = &autopilot.Server{
|
||||
ID: raft.ServerID(d.localID),
|
||||
Name: d.localID,
|
||||
RaftVersion: raft.ProtocolVersionMax,
|
||||
NodeStatus: autopilot.NodeAlive,
|
||||
Ext: d.autopilotServerExt("voter"),
|
||||
}
|
||||
|
||||
return ret
|
||||
}
|
||||
|
||||
// RemoveFailedServer is called by the autopilot library when it desires a node
|
||||
// to be removed from the raft configuration. This function removes the node
|
||||
// from the raft cluster and stops tracking its information in follower states.
|
||||
// This function needs to return quickly. Hence removal is performed in a
|
||||
// goroutine.
|
||||
func (d *Delegate) RemoveFailedServer(server *autopilot.Server) {
|
||||
go func() {
|
||||
added := false
|
||||
defer func() {
|
||||
if added {
|
||||
d.dl.Lock()
|
||||
delete(d.inflightRemovals, server.ID)
|
||||
d.dl.Unlock()
|
||||
}
|
||||
}()
|
||||
|
||||
d.dl.Lock()
|
||||
_, ok := d.inflightRemovals[server.ID]
|
||||
if ok {
|
||||
d.logger.Info("removal of dead server is already initiated", "id", server.ID)
|
||||
d.dl.Unlock()
|
||||
return
|
||||
}
|
||||
|
||||
added = true
|
||||
d.inflightRemovals[server.ID] = true
|
||||
d.dl.Unlock()
|
||||
|
||||
d.logger.Info("removing dead server from raft configuration", "id", server.ID)
|
||||
if future := d.raft.RemoveServer(server.ID, 0, 0); future.Error() != nil {
|
||||
d.logger.Error("failed to remove server", "server_id", server.ID, "server_address", server.Address, "server_name", server.Name, "error", future.Error())
|
||||
return
|
||||
}
|
||||
|
||||
d.followerStates.Delete(string(server.ID))
|
||||
}()
|
||||
}
|
||||
|
||||
// SetFollowerStates sets the followerStates field in the backend to track peers
|
||||
// in the raft cluster.
|
||||
func (b *RaftBackend) SetFollowerStates(states *FollowerStates) {
|
||||
b.l.Lock()
|
||||
b.followerStates = states
|
||||
b.l.Unlock()
|
||||
}
|
||||
|
||||
// SetAutopilotConfig updates the autopilot configuration in the backend.
|
||||
func (b *RaftBackend) SetAutopilotConfig(config *AutopilotConfig) {
|
||||
b.l.Lock()
|
||||
b.autopilotConfig = config
|
||||
b.logger.Info("updated autopilot configuration", "config", b.autopilotConfig)
|
||||
b.l.Unlock()
|
||||
}
|
||||
|
||||
// AutopilotConfig returns the autopilot configuration in the backend.
|
||||
func (b *RaftBackend) AutopilotConfig() *AutopilotConfig {
|
||||
b.l.RLock()
|
||||
defer b.l.RUnlock()
|
||||
return b.autopilotConfig.Clone()
|
||||
}
|
||||
|
||||
func (b *RaftBackend) defaultAutopilotConfig() *AutopilotConfig {
|
||||
return &AutopilotConfig{
|
||||
CleanupDeadServers: false,
|
||||
LastContactThreshold: 10 * time.Second,
|
||||
DeadServerLastContactThreshold: 24 * time.Hour,
|
||||
MaxTrailingLogs: 1000,
|
||||
ServerStabilizationTime: 10 * time.Second,
|
||||
}
|
||||
}
|
||||
|
||||
func (b *RaftBackend) AutopilotDisabled() bool {
|
||||
b.l.RLock()
|
||||
disabled := b.disableAutopilot
|
||||
b.l.RUnlock()
|
||||
return disabled
|
||||
}
|
||||
|
||||
// AutopilotExecutionStatus represents the current status of the autopilot background goroutines
|
||||
type AutopilotExecutionStatus string
|
||||
|
||||
const (
|
||||
AutopilotNotRunning AutopilotExecutionStatus = "not-running"
|
||||
AutopilotRunning AutopilotExecutionStatus = "running"
|
||||
AutopilotShuttingDown AutopilotExecutionStatus = "shutting-down"
|
||||
)
|
||||
|
||||
func autopilotStatusToStatus(status autopilot.ExecutionStatus) AutopilotExecutionStatus {
|
||||
switch status {
|
||||
case autopilot.Running:
|
||||
return AutopilotRunning
|
||||
case autopilot.ShuttingDown:
|
||||
return AutopilotShuttingDown
|
||||
default:
|
||||
return AutopilotNotRunning
|
||||
}
|
||||
}
|
||||
|
||||
func (b *RaftBackend) startFollowerHeartbeatTracker() {
|
||||
b.l.RLock()
|
||||
tickerCh := b.followerHeartbeatTicker.C
|
||||
b.l.RUnlock()
|
||||
|
||||
for range tickerCh {
|
||||
b.l.RLock()
|
||||
if b.autopilotConfig.CleanupDeadServers && b.autopilotConfig.DeadServerLastContactThreshold != 0 {
|
||||
b.followerStates.l.RLock()
|
||||
for _, state := range b.followerStates.followers {
|
||||
if state.LastHeartbeat.IsZero() || state.IsDead.Load() {
|
||||
continue
|
||||
}
|
||||
now := time.Now()
|
||||
if now.After(state.LastHeartbeat.Add(b.autopilotConfig.DeadServerLastContactThreshold)) {
|
||||
state.IsDead.Store(true)
|
||||
}
|
||||
}
|
||||
b.followerStates.l.RUnlock()
|
||||
}
|
||||
b.l.RUnlock()
|
||||
}
|
||||
}
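The tracker above marks a follower dead once its last heartbeat is older than DeadServerLastContactThreshold, and only when dead-server cleanup is enabled. The same check in isolation, with hypothetical values:

package main

import (
    "fmt"
    "time"
)

func main() {
    // Hypothetical values mirroring the check in startFollowerHeartbeatTracker.
    deadServerLastContactThreshold := 24 * time.Hour
    lastHeartbeat := time.Now().Add(-25 * time.Hour) // follower silent for 25h

    // A follower with no heartbeat yet is skipped; otherwise it is marked dead
    // once the threshold has elapsed since the last heartbeat.
    isDead := !lastHeartbeat.IsZero() &&
        time.Now().After(lastHeartbeat.Add(deadServerLastContactThreshold))
    fmt.Println(isDead) // true: KnownServers will report the node as NodeLeft
}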
|
||||
|
||||
// StopAutopilot stops a running autopilot instance. This should only be called
|
||||
// on the active node.
|
||||
func (b *RaftBackend) StopAutopilot() {
|
||||
b.l.Lock()
|
||||
defer b.l.Unlock()
|
||||
|
||||
if b.autopilot == nil {
|
||||
return
|
||||
}
|
||||
b.autopilot.Stop()
|
||||
b.followerHeartbeatTicker.Stop()
|
||||
}
|
||||
|
||||
// AutopilotState represents the health information retrieved from autopilot.
|
||||
type AutopilotState struct {
|
||||
ExecutionStatus AutopilotExecutionStatus `json:"execution_status"`
|
||||
Healthy bool `json:"healthy"`
|
||||
FailureTolerance int `json:"failure_tolerance"`
|
||||
OptimisticFailureTolerance int `json:"optimistic_failure_tolerance"`
|
||||
|
||||
Servers map[string]*AutopilotServer `json:"servers"`
|
||||
Leader string `json:"leader"`
|
||||
Voters []string `json:"voters"`
|
||||
NonVoters []string `json:"non_voters,omitempty"`
|
||||
}
|
||||
|
||||
// AutopilotServer represents the health information of an individual server
// node retrieved from autopilot.
|
||||
type AutopilotServer struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Address string `json:"address"`
|
||||
NodeStatus string `json:"node_status"`
|
||||
LastContact *ReadableDuration `json:"last_contact"`
|
||||
LastTerm uint64 `json:"last_term"`
|
||||
LastIndex uint64 `json:"last_index"`
|
||||
Healthy bool `json:"healthy"`
|
||||
StableSince time.Time `json:"stable_since"`
|
||||
Status string `json:"status"`
|
||||
Meta map[string]string `json:"meta"`
|
||||
}
|
||||
|
||||
// ReadableDuration is a duration type that is serialized to JSON in human readable format.
|
||||
type ReadableDuration time.Duration
|
||||
|
||||
func NewReadableDuration(dur time.Duration) *ReadableDuration {
|
||||
d := ReadableDuration(dur)
|
||||
return &d
|
||||
}
|
||||
|
||||
func (d *ReadableDuration) String() string {
|
||||
return d.Duration().String()
|
||||
}
|
||||
|
||||
func (d *ReadableDuration) Duration() time.Duration {
|
||||
if d == nil {
|
||||
return time.Duration(0)
|
||||
}
|
||||
return time.Duration(*d)
|
||||
}
|
||||
|
||||
func (d *ReadableDuration) MarshalJSON() ([]byte, error) {
|
||||
return []byte(fmt.Sprintf(`"%s"`, d.Duration().String())), nil
|
||||
}
|
||||
|
||||
func (d *ReadableDuration) UnmarshalJSON(raw []byte) (err error) {
|
||||
if d == nil {
|
||||
return fmt.Errorf("cannot unmarshal to nil pointer")
|
||||
}
|
||||
|
||||
var dur time.Duration
|
||||
str := string(raw)
|
||||
if len(str) >= 2 && str[0] == '"' && str[len(str)-1] == '"' {
|
||||
// quoted string
|
||||
dur, err = time.ParseDuration(str[1 : len(str)-1])
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
// no quotes, not a string
|
||||
v, err := strconv.ParseFloat(str, 64)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
dur = time.Duration(v)
|
||||
}
|
||||
|
||||
*d = ReadableDuration(dur)
|
||||
return nil
|
||||
}
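ReadableDuration therefore round-trips as a quoted duration string while still accepting a bare nanosecond count on input. A short sketch:

package main

import (
    "encoding/json"
    "fmt"
    "time"

    raft "github.com/hashicorp/vault/physical/raft"
)

func main() {
    // Marshals to a quoted, human-readable string.
    d := raft.NewReadableDuration(1500 * time.Millisecond)
    out, _ := json.Marshal(d)
    fmt.Println(string(out)) // "1.5s"

    // Unmarshals from either a quoted duration string or a bare number of nanoseconds.
    var back raft.ReadableDuration
    _ = json.Unmarshal([]byte(`"2m30s"`), &back)
    fmt.Println(back.Duration()) // 2m30s

    _ = json.Unmarshal([]byte(`1000000000`), &back)
    fmt.Println(back.Duration()) // 1s
}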
|
||||
|
||||
func stringIDs(ids []raft.ServerID) []string {
|
||||
out := make([]string, len(ids))
|
||||
for i, id := range ids {
|
||||
out[i] = string(id)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func autopilotToAPIState(state *autopilot.State) *AutopilotState {
|
||||
out := &AutopilotState{
|
||||
Healthy: state.Healthy,
|
||||
FailureTolerance: state.FailureTolerance,
|
||||
Leader: string(state.Leader),
|
||||
Voters: stringIDs(state.Voters),
|
||||
Servers: make(map[string]*AutopilotServer),
|
||||
}
|
||||
|
||||
for id, srv := range state.Servers {
|
||||
out.Servers[string(id)] = autopilotToAPIServer(srv)
|
||||
}
|
||||
|
||||
autopilotToAPIStateEnterprise(state, out)
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func autopilotToAPIServer(srv *autopilot.ServerState) *AutopilotServer {
|
||||
apiSrv := &AutopilotServer{
|
||||
ID: string(srv.Server.ID),
|
||||
Name: srv.Server.Name,
|
||||
Address: string(srv.Server.Address),
|
||||
NodeStatus: string(srv.Server.NodeStatus),
|
||||
LastContact: NewReadableDuration(srv.Stats.LastContact),
|
||||
LastTerm: srv.Stats.LastTerm,
|
||||
LastIndex: srv.Stats.LastIndex,
|
||||
Healthy: srv.Health.Healthy,
|
||||
StableSince: srv.Health.StableSince,
|
||||
Status: string(srv.State),
|
||||
Meta: srv.Server.Meta,
|
||||
}
|
||||
|
||||
autopilotToAPIServerEnterprise(srv, apiSrv)
|
||||
|
||||
return apiSrv
|
||||
}
|
||||
|
||||
// GetAutopilotServerState retrieves raft cluster state from autopilot to
|
||||
// return over the API.
|
||||
func (b *RaftBackend) GetAutopilotServerState(ctx context.Context) (*AutopilotState, error) {
|
||||
b.l.RLock()
|
||||
defer b.l.RUnlock()
|
||||
|
||||
if b.raft == nil {
|
||||
return nil, errors.New("raft storage is not initialized")
|
||||
}
|
||||
|
||||
if b.autopilot == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
apState := b.autopilot.GetState()
|
||||
if apState == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
state := autopilotToAPIState(apState)
|
||||
|
||||
apStatus, _ := b.autopilot.IsRunning()
|
||||
state.ExecutionStatus = autopilotStatusToStatus(apStatus)
|
||||
|
||||
return state, nil
|
||||
}
|
||||
|
||||
func (b *RaftBackend) DisableAutopilot() {
|
||||
b.l.Lock()
|
||||
b.disableAutopilot = true
|
||||
b.l.Unlock()
|
||||
}
|
||||
|
||||
// SetupAutopilot gathers information required to configure autopilot and starts
|
||||
// it. If autopilot is disabled, this function does nothing.
|
||||
func (b *RaftBackend) SetupAutopilot(ctx context.Context, storageConfig *AutopilotConfig, followerStates *FollowerStates, disable bool) {
|
||||
b.l.Lock()
|
||||
if disable || os.Getenv("VAULT_RAFT_AUTOPILOT_DISABLE") != "" {
|
||||
b.disableAutopilot = true
|
||||
}
|
||||
|
||||
if b.disableAutopilot {
|
||||
b.logger.Info("disabling autopilot")
|
||||
b.l.Unlock()
|
||||
return
|
||||
}
|
||||
|
||||
// Start with a default config
|
||||
b.autopilotConfig = b.defaultAutopilotConfig()
|
||||
|
||||
// Merge the setting provided over the API
|
||||
b.autopilotConfig.Merge(storageConfig)
|
||||
|
||||
// Create the autopilot instance
|
||||
b.autopilot = autopilot.New(b.raft, newDelegate(b), autopilot.WithLogger(b.logger), autopilot.WithPromoter(b.autopilotPromoter()))
|
||||
b.followerStates = followerStates
|
||||
b.followerHeartbeatTicker = time.NewTicker(1 * time.Second)
|
||||
|
||||
b.l.Unlock()
|
||||
|
||||
b.logger.Info("starting autopilot", "config", b.autopilotConfig)
|
||||
b.autopilot.Start(ctx)
|
||||
|
||||
go b.startFollowerHeartbeatTracker()
|
||||
}
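The call site for SetupAutopilot lives in Vault's core and is not shown in this hunk; the sketch below (function name hypothetical, same package) only illustrates the expected order of operations when a node becomes active and later steps down.

// Sketch (assumed wiring): what the active node does when it takes over duty.
func startAutopilotOnLeader(ctx context.Context, b *RaftBackend, storedConfig *AutopilotConfig, disable bool) {
    // Track follower heartbeats for the duration of this node's leadership.
    followerStates := NewFollowerStates()

    // Merges the persisted config over the defaults, instantiates the autopilot
    // library, and starts the follower heartbeat tracker. No-op when disabled.
    b.SetupAutopilot(ctx, storedConfig, followerStates, disable)

    // ... later, when leadership is lost:
    // b.StopAutopilot()
}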
|
|
@ -87,6 +87,8 @@ func getRaftWithDir(t testing.TB, bootstrap bool, noStoreState bool, raftDir str
|
|||
|
||||
}
|
||||
|
||||
backend.DisableAutopilot()
|
||||
|
||||
return backend, raftDir
|
||||
}
|
||||
|
||||
|
|
|
@ -5,11 +5,34 @@ package raft
|
|||
import (
|
||||
"context"
|
||||
"errors"
|
||||
|
||||
autopilot "github.com/hashicorp/raft-autopilot"
|
||||
)
|
||||
|
||||
const nonVotersAllowed = false
|
||||
|
||||
// AddPeer adds a new server to the raft cluster
|
||||
func (b *RaftBackend) AddNonVotingPeer(ctx context.Context, peerID, clusterAddr string) error {
|
||||
return errors.New("not implemented")
|
||||
func (b *RaftBackend) autopilotPromoter() autopilot.Promoter {
|
||||
return autopilot.DefaultPromoter()
|
||||
}
|
||||
|
||||
// AddNonVotingPeer adds a new server to the raft cluster
|
||||
func (b *RaftBackend) AddNonVotingPeer(ctx context.Context, peerID, clusterAddr string) error {
|
||||
return errors.New("adding non voting peer is not allowed")
|
||||
}
|
||||
|
||||
func autopilotToAPIServerEnterprise(_ *autopilot.ServerState, _ *AutopilotServer) {
|
||||
// noop in oss
|
||||
}
|
||||
|
||||
func autopilotToAPIStateEnterprise(state *autopilot.State, apiState *AutopilotState) {
|
||||
// Both are the same in OSS
|
||||
apiState.OptimisticFailureTolerance = state.FailureTolerance
|
||||
}
|
||||
|
||||
func (d *Delegate) autopilotConfigExt() interface{} {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (d *Delegate) autopilotServerExt(_ string) interface{} {
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -590,7 +590,7 @@ func TestBoltSnapshotStore_Listing(t *testing.T) {
|
|||
Level: hclog.Trace,
|
||||
})
|
||||
|
||||
fsm, err := NewFSM(parent, logger)
|
||||
fsm, err := NewFSM(parent, "", logger)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
@ -655,7 +655,7 @@ func TestBoltSnapshotStore_CreateInstallSnapshot(t *testing.T) {
|
|||
Level: hclog.Trace,
|
||||
})
|
||||
|
||||
fsm, err := NewFSM(parent, logger)
|
||||
fsm, err := NewFSM(parent, "", logger)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
@ -753,7 +753,7 @@ func TestBoltSnapshotStore_CreateInstallSnapshot(t *testing.T) {
|
|||
t.Fatal("expected snapshot installer object")
|
||||
}
|
||||
|
||||
newFSM, err := NewFSM(filepath.Dir(installer.Filename()), logger)
|
||||
newFSM, err := NewFSM(filepath.Dir(installer.Filename()), "", logger)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
@ -812,7 +812,7 @@ func TestBoltSnapshotStore_CreateInstallSnapshot(t *testing.T) {
|
|||
|
||||
// Close/Reopen the db and make sure we still match
|
||||
fsm.Close()
|
||||
fsm, err = NewFSM(parent, logger)
|
||||
fsm, err = NewFSM(parent, "", logger)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
|
|
@ -320,6 +320,53 @@ func (x *ConfigurationValue) GetServers() []*Server {
|
|||
return nil
|
||||
}
|
||||
|
||||
type LocalNodeConfigValue struct {
|
||||
state protoimpl.MessageState
|
||||
sizeCache protoimpl.SizeCache
|
||||
unknownFields protoimpl.UnknownFields
|
||||
|
||||
DesiredSuffrage string `protobuf:"bytes,1,opt,name=desired_suffrage,json=desiredSuffrage,proto3" json:"desired_suffrage,omitempty"`
|
||||
}
|
||||
|
||||
func (x *LocalNodeConfigValue) Reset() {
|
||||
*x = LocalNodeConfigValue{}
|
||||
if protoimpl.UnsafeEnabled {
|
||||
mi := &file_physical_raft_types_proto_msgTypes[5]
|
||||
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
|
||||
ms.StoreMessageInfo(mi)
|
||||
}
|
||||
}
|
||||
|
||||
func (x *LocalNodeConfigValue) String() string {
|
||||
return protoimpl.X.MessageStringOf(x)
|
||||
}
|
||||
|
||||
func (*LocalNodeConfigValue) ProtoMessage() {}
|
||||
|
||||
func (x *LocalNodeConfigValue) ProtoReflect() protoreflect.Message {
|
||||
mi := &file_physical_raft_types_proto_msgTypes[5]
|
||||
if protoimpl.UnsafeEnabled && x != nil {
|
||||
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
|
||||
if ms.LoadMessageInfo() == nil {
|
||||
ms.StoreMessageInfo(mi)
|
||||
}
|
||||
return ms
|
||||
}
|
||||
return mi.MessageOf(x)
|
||||
}
|
||||
|
||||
// Deprecated: Use LocalNodeConfigValue.ProtoReflect.Descriptor instead.
|
||||
func (*LocalNodeConfigValue) Descriptor() ([]byte, []int) {
|
||||
return file_physical_raft_types_proto_rawDescGZIP(), []int{5}
|
||||
}
|
||||
|
||||
func (x *LocalNodeConfigValue) GetDesiredSuffrage() string {
|
||||
if x != nil {
|
||||
return x.DesiredSuffrage
|
||||
}
|
||||
return ""
|
||||
}
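The desired suffrage is persisted as a LocalNodeConfigValue protobuf message; a minimal marshal/unmarshal sketch using the same golang/protobuf package this diff already imports:

package main

import (
    "fmt"

    proto "github.com/golang/protobuf/proto"
    raft "github.com/hashicorp/vault/physical/raft"
)

func main() {
    // The message the FSM stores to remember this node's desired suffrage.
    in := &raft.LocalNodeConfigValue{DesiredSuffrage: "non-voter"}
    buf, err := proto.Marshal(in)
    if err != nil {
        panic(err)
    }

    out := new(raft.LocalNodeConfigValue)
    if err := proto.Unmarshal(buf, out); err != nil {
        panic(err)
    }
    fmt.Println(out.GetDesiredSuffrage()) // non-voter
}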
|
||||
|
||||
var File_physical_raft_types_proto protoreflect.FileDescriptor
|
||||
|
||||
var file_physical_raft_types_proto_rawDesc = []byte{
|
||||
|
@ -349,10 +396,14 @@ var file_physical_raft_types_proto_rawDesc = []byte{
|
|||
0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x12, 0x26, 0x0a, 0x07, 0x73,
|
||||
0x65, 0x72, 0x76, 0x65, 0x72, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x0c, 0x2e, 0x72,
|
||||
0x61, 0x66, 0x74, 0x2e, 0x53, 0x65, 0x72, 0x76, 0x65, 0x72, 0x52, 0x07, 0x73, 0x65, 0x72, 0x76,
|
||||
0x65, 0x72, 0x73, 0x42, 0x2a, 0x5a, 0x28, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f,
|
||||
0x6d, 0x2f, 0x68, 0x61, 0x73, 0x68, 0x69, 0x63, 0x6f, 0x72, 0x70, 0x2f, 0x76, 0x61, 0x75, 0x6c,
|
||||
0x74, 0x2f, 0x70, 0x68, 0x79, 0x73, 0x69, 0x63, 0x61, 0x6c, 0x2f, 0x72, 0x61, 0x66, 0x74, 0x62,
|
||||
0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33,
|
||||
0x65, 0x72, 0x73, 0x22, 0x41, 0x0a, 0x14, 0x4c, 0x6f, 0x63, 0x61, 0x6c, 0x4e, 0x6f, 0x64, 0x65,
|
||||
0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x12, 0x29, 0x0a, 0x10, 0x64,
|
||||
0x65, 0x73, 0x69, 0x72, 0x65, 0x64, 0x5f, 0x73, 0x75, 0x66, 0x66, 0x72, 0x61, 0x67, 0x65, 0x18,
|
||||
0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0f, 0x64, 0x65, 0x73, 0x69, 0x72, 0x65, 0x64, 0x53, 0x75,
|
||||
0x66, 0x66, 0x72, 0x61, 0x67, 0x65, 0x42, 0x2a, 0x5a, 0x28, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62,
|
||||
0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x68, 0x61, 0x73, 0x68, 0x69, 0x63, 0x6f, 0x72, 0x70, 0x2f, 0x76,
|
||||
0x61, 0x75, 0x6c, 0x74, 0x2f, 0x70, 0x68, 0x79, 0x73, 0x69, 0x63, 0x61, 0x6c, 0x2f, 0x72, 0x61,
|
||||
0x66, 0x74, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33,
|
||||
}
|
||||
|
||||
var (
|
||||
|
@ -367,13 +418,14 @@ func file_physical_raft_types_proto_rawDescGZIP() []byte {
|
|||
return file_physical_raft_types_proto_rawDescData
|
||||
}
|
||||
|
||||
var file_physical_raft_types_proto_msgTypes = make([]protoimpl.MessageInfo, 5)
|
||||
var file_physical_raft_types_proto_msgTypes = make([]protoimpl.MessageInfo, 6)
|
||||
var file_physical_raft_types_proto_goTypes = []interface{}{
|
||||
(*LogOperation)(nil), // 0: raft.LogOperation
|
||||
(*LogData)(nil), // 1: raft.LogData
|
||||
(*IndexValue)(nil), // 2: raft.IndexValue
|
||||
(*Server)(nil), // 3: raft.Server
|
||||
(*ConfigurationValue)(nil), // 4: raft.ConfigurationValue
|
||||
(*LogOperation)(nil), // 0: raft.LogOperation
|
||||
(*LogData)(nil), // 1: raft.LogData
|
||||
(*IndexValue)(nil), // 2: raft.IndexValue
|
||||
(*Server)(nil), // 3: raft.Server
|
||||
(*ConfigurationValue)(nil), // 4: raft.ConfigurationValue
|
||||
(*LocalNodeConfigValue)(nil), // 5: raft.LocalNodeConfigValue
|
||||
}
|
||||
var file_physical_raft_types_proto_depIdxs = []int32{
|
||||
0, // 0: raft.LogData.operations:type_name -> raft.LogOperation
|
||||
|
@ -451,6 +503,18 @@ func file_physical_raft_types_proto_init() {
|
|||
return nil
|
||||
}
|
||||
}
|
||||
file_physical_raft_types_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} {
|
||||
switch v := v.(*LocalNodeConfigValue); i {
|
||||
case 0:
|
||||
return &v.state
|
||||
case 1:
|
||||
return &v.sizeCache
|
||||
case 2:
|
||||
return &v.unknownFields
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
}
|
||||
type x struct{}
|
||||
out := protoimpl.TypeBuilder{
|
||||
|
@ -458,7 +522,7 @@ func file_physical_raft_types_proto_init() {
|
|||
GoPackagePath: reflect.TypeOf(x{}).PkgPath(),
|
||||
RawDescriptor: file_physical_raft_types_proto_rawDesc,
|
||||
NumEnums: 0,
|
||||
NumMessages: 5,
|
||||
NumMessages: 6,
|
||||
NumExtensions: 0,
|
||||
NumServices: 0,
|
||||
},
|
||||
|
|
|
@ -37,3 +37,7 @@ message ConfigurationValue {
|
|||
uint64 index = 1;
|
||||
repeated Server servers = 2;
|
||||
}
|
||||
|
||||
message LocalNodeConfigValue{
|
||||
string desired_suffrage = 1;
|
||||
}
|
||||
|
|
|
@ -26,6 +26,8 @@ import (
|
|||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/vault/physical/raft"
|
||||
|
||||
"github.com/armon/go-metrics"
|
||||
"github.com/hashicorp/errwrap"
|
||||
log "github.com/hashicorp/go-hclog"
|
||||
|
@ -517,8 +519,8 @@ type Core struct {
|
|||
// Stores request counters
|
||||
counters counters
|
||||
|
||||
// Stores the raft applied index for standby nodes
|
||||
raftFollowerStates *raftFollowerStates
|
||||
// raftFollowerStates tracks information about all the raft follower nodes.
|
||||
raftFollowerStates *raft.FollowerStates
|
||||
// Stop channel for raft TLS rotations
|
||||
raftTLSRotationStopCh chan struct{}
|
||||
// Stores the pending peers we are waiting to give answers
|
||||
|
@ -562,6 +564,9 @@ type Core struct {
|
|||
numExpirationWorkers int
|
||||
|
||||
IndexHeaderHMACKey uberAtomic.Value
|
||||
|
||||
// disableAutopilot is used to disable the autopilot subsystem in raft storage
|
||||
disableAutopilot bool
|
||||
}
|
||||
|
||||
// CoreConfig is used to parameterize a core
|
||||
|
@ -667,6 +672,9 @@ type CoreConfig struct {
|
|||
|
||||
// number of workers to use for lease revocation in the expiration manager
|
||||
NumExpirationWorkers int
|
||||
|
||||
// DisableAutopilot is used to disable autopilot subsystem in raft storage
|
||||
DisableAutopilot bool
|
||||
}
|
||||
|
||||
// GetServiceRegistration returns the config's ServiceRegistration, or nil if it does
|
||||
|
@ -813,6 +821,8 @@ func NewCore(conf *CoreConfig) (*Core, error) {
|
|||
activityLogConfig: conf.ActivityLogConfig,
|
||||
keyRotateGracePeriod: new(int64),
|
||||
numExpirationWorkers: conf.NumExpirationWorkers,
|
||||
raftFollowerStates: raft.NewFollowerStates(),
|
||||
disableAutopilot: conf.DisableAutopilot,
|
||||
}
|
||||
c.standbyStopCh.Store(make(chan struct{}))
|
||||
atomic.StoreUint32(c.sealed, 1)
|
||||
|
|
|
@ -0,0 +1,211 @@
|
|||
package rafttests
|
||||
|
||||
import (
|
||||
"context"
|
||||
"math"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/vault/api"
|
||||
"github.com/kr/pretty"
|
||||
|
||||
autopilot "github.com/hashicorp/raft-autopilot"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/hashicorp/vault/helper/namespace"
|
||||
"github.com/hashicorp/vault/helper/testhelpers"
|
||||
"github.com/hashicorp/vault/physical/raft"
|
||||
"github.com/hashicorp/vault/vault"
|
||||
)
|
||||
|
||||
func TestRaft_Autopilot_Disable(t *testing.T) {
|
||||
cluster := raftCluster(t, &RaftClusterOpts{
|
||||
DisableFollowerJoins: true,
|
||||
InmemCluster: true,
|
||||
// Not setting EnableAutopilot here.
|
||||
})
|
||||
defer cluster.Cleanup()
|
||||
|
||||
client := cluster.Cores[0].Client
|
||||
|
||||
state, err := client.Sys().RaftAutopilotState()
|
||||
require.NoError(t, err)
|
||||
require.Nil(t, state)
|
||||
}
|
||||
|
||||
func TestRaft_Autopilot_Stabilization_And_State(t *testing.T) {
|
||||
cluster := raftCluster(t, &RaftClusterOpts{
|
||||
DisableFollowerJoins: true,
|
||||
InmemCluster: true,
|
||||
EnableAutopilot: true,
|
||||
})
|
||||
defer cluster.Cleanup()
|
||||
|
||||
// Check that autopilot execution state is running
|
||||
client := cluster.Cores[0].Client
|
||||
state, err := client.Sys().RaftAutopilotState()
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, api.AutopilotRunning, state.ExecutionStatus)
|
||||
require.Equal(t, true, state.Healthy)
|
||||
require.Len(t, state.Servers, 1)
|
||||
require.Equal(t, "core-0", state.Servers["core-0"].ID)
|
||||
require.Equal(t, "alive", state.Servers["core-0"].NodeStatus)
|
||||
require.Equal(t, "leader", state.Servers["core-0"].Status)
|
||||
|
||||
config, err := client.Sys().RaftAutopilotConfiguration()
|
||||
require.NoError(t, err)
|
||||
|
||||
// Wait for 110% of the stabilization time to add nodes
|
||||
stabilizationKickOffWaitDuration := time.Duration(math.Ceil(1.1 * float64(config.ServerStabilizationTime)))
|
||||
time.Sleep(stabilizationKickOffWaitDuration)
|
||||
|
||||
joinAndStabilizeFunc := func(core *vault.TestClusterCore, nodeID string, numServers int) {
|
||||
joinFunc := func(core *vault.TestClusterCore) {
|
||||
_, err := core.JoinRaftCluster(namespace.RootContext(context.Background()), []*raft.LeaderJoinInfo{
|
||||
{
|
||||
LeaderAPIAddr: client.Address(),
|
||||
TLSConfig: cluster.Cores[0].TLSConfig,
|
||||
Retry: true,
|
||||
},
|
||||
}, false)
|
||||
require.NoError(t, err)
|
||||
time.Sleep(1 * time.Second)
|
||||
cluster.UnsealCore(t, core)
|
||||
}
|
||||
joinFunc(core)
|
||||
|
||||
state, err = client.Sys().RaftAutopilotState()
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, false, state.Healthy)
|
||||
require.Len(t, state.Servers, numServers)
|
||||
require.Equal(t, false, state.Servers[nodeID].Healthy)
|
||||
require.Equal(t, "alive", state.Servers[nodeID].NodeStatus)
|
||||
require.Equal(t, "non-voter", state.Servers[nodeID].Status)
|
||||
|
||||
// Wait till the stabilization period is over
|
||||
stabilizationWaitDuration := config.ServerStabilizationTime
|
||||
deadline := time.Now().Add(stabilizationWaitDuration)
|
||||
healthy := false
|
||||
for time.Now().Before(deadline) {
|
||||
state, err := client.Sys().RaftAutopilotState()
|
||||
require.NoError(t, err)
|
||||
if state.Healthy {
|
||||
healthy = true
|
||||
}
|
||||
time.Sleep(1 * time.Second)
|
||||
}
|
||||
if !healthy {
|
||||
t.Fatalf("cluster failed to stabilize")
|
||||
}
|
||||
|
||||
// Now that the server is stable, wait for autopilot to reconcile and
|
||||
// promotion to happen. Reconcile interval is 10 seconds. Bound it by
|
||||
// doubling.
|
||||
deadline = time.Now().Add(2 * autopilot.DefaultReconcileInterval)
|
||||
failed := true
|
||||
for time.Now().Before(deadline) {
|
||||
state, err = client.Sys().RaftAutopilotState()
|
||||
require.NoError(t, err)
|
||||
if state.Servers[nodeID].Status == "voter" {
|
||||
failed = false
|
||||
break
|
||||
}
|
||||
time.Sleep(1 * time.Second)
|
||||
}
|
||||
|
||||
if failed {
|
||||
t.Fatalf("autopilot failed to promote node: id: %#v: state:%# v\n", nodeID, pretty.Formatter(state))
|
||||
}
|
||||
}
|
||||
joinAndStabilizeFunc(cluster.Cores[1], "core-1", 2)
|
||||
joinAndStabilizeFunc(cluster.Cores[2], "core-2", 3)
|
||||
state, err = client.Sys().RaftAutopilotState()
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, []string{"core-0", "core-1", "core-2"}, state.Voters)
|
||||
}
|
||||
|
||||
func TestRaft_Autopilot_Configuration(t *testing.T) {
|
||||
cluster := raftCluster(t, &RaftClusterOpts{
|
||||
DisableFollowerJoins: true,
|
||||
InmemCluster: true,
|
||||
EnableAutopilot: true,
|
||||
})
|
||||
defer cluster.Cleanup()
|
||||
|
||||
client := cluster.Cores[0].Client
|
||||
configCheckFunc := func(config *api.AutopilotConfig) {
|
||||
conf, err := client.Sys().RaftAutopilotConfiguration()
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, config, conf)
|
||||
}
|
||||
|
||||
writeConfigFunc := func(config map[string]interface{}, expectError bool) {
|
||||
resp, err := client.Logical().Write("sys/storage/raft/autopilot/configuration", config)
|
||||
if expectError {
|
||||
require.Error(t, err)
|
||||
return
|
||||
}
|
||||
require.NoError(t, err)
|
||||
require.Nil(t, resp)
|
||||
}
|
||||
|
||||
// Ensure autopilot's default config has taken effect
|
||||
config := &api.AutopilotConfig{
|
||||
CleanupDeadServers: false,
|
||||
DeadServerLastContactThreshold: 24 * time.Hour,
|
||||
LastContactThreshold: 10 * time.Second,
|
||||
MaxTrailingLogs: 1000,
|
||||
ServerStabilizationTime: 10 * time.Second,
|
||||
}
|
||||
configCheckFunc(config)
|
||||
|
||||
// Update config
|
||||
writableConfig := map[string]interface{}{
|
||||
"cleanup_dead_servers": true,
|
||||
"dead_server_last_contact_threshold": "100h",
|
||||
"last_contact_threshold": "100s",
|
||||
"max_trailing_logs": 100,
|
||||
"min_quorum": 100,
|
||||
"server_stabilization_time": "100s",
|
||||
}
|
||||
writeConfigFunc(writableConfig, false)
|
||||
|
||||
// Ensure update has taken effect
|
||||
config.CleanupDeadServers = true
|
||||
config.DeadServerLastContactThreshold = 100 * time.Hour
|
||||
config.LastContactThreshold = 100 * time.Second
|
||||
config.MaxTrailingLogs = 100
|
||||
config.MinQuorum = 100
|
||||
config.ServerStabilizationTime = 100 * time.Second
|
||||
configCheckFunc(config)
|
||||
|
||||
// Update some fields and leave the rest as it is.
|
||||
writableConfig = map[string]interface{}{
|
||||
"dead_server_last_contact_threshold": "50h",
|
||||
"max_trailing_logs": 50,
|
||||
"server_stabilization_time": "50s",
|
||||
}
|
||||
writeConfigFunc(writableConfig, false)
|
||||
|
||||
// Check update
|
||||
config.DeadServerLastContactThreshold = 50 * time.Hour
|
||||
config.MaxTrailingLogs = 50
|
||||
config.ServerStabilizationTime = 50 * time.Second
|
||||
configCheckFunc(config)
|
||||
|
||||
// Check error case
|
||||
writableConfig = map[string]interface{}{
|
||||
"min_quorum": 2,
|
||||
"dead_server_last_contact_threshold": "48h",
|
||||
}
|
||||
writeConfigFunc(writableConfig, true)
|
||||
configCheckFunc(config)
|
||||
|
||||
// Ensure that the configuration stays across reboots
|
||||
leaderCore := cluster.Cores[0]
|
||||
testhelpers.EnsureCoreSealed(t, cluster.Cores[0])
|
||||
cluster.UnsealCore(t, leaderCore)
|
||||
vault.TestWaitActive(t, leaderCore.Core)
|
||||
configCheckFunc(config)
|
||||
}
|
|
@ -8,6 +8,7 @@ import (
|
|||
"io/ioutil"
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
|
@ -23,28 +24,60 @@ import (
|
|||
vaulthttp "github.com/hashicorp/vault/http"
|
||||
"github.com/hashicorp/vault/physical/raft"
|
||||
"github.com/hashicorp/vault/sdk/helper/logging"
|
||||
|
||||
"github.com/hashicorp/vault/sdk/logical"
|
||||
"github.com/hashicorp/vault/vault"
|
||||
vaultcluster "github.com/hashicorp/vault/vault/cluster"
|
||||
"github.com/stretchr/testify/require"
|
||||
"golang.org/x/net/http2"
|
||||
)
|
||||
|
||||
func raftCluster(t testing.TB) *vault.TestCluster {
|
||||
return raftClusterWithPerfStandby(t, false)
|
||||
type RaftClusterOpts struct {
|
||||
DisableFollowerJoins bool
|
||||
InmemCluster bool
|
||||
EnableAutopilot bool
|
||||
PhysicalFactoryConfig map[string]interface{}
|
||||
DisablePerfStandby bool
|
||||
}
|
||||
|
||||
func raftClusterWithPerfStandby(t testing.TB, disablePerfStandby bool) *vault.TestCluster {
|
||||
func raftCluster(t testing.TB, ropts *RaftClusterOpts) *vault.TestCluster {
|
||||
if ropts == nil {
|
||||
ropts = &RaftClusterOpts{}
|
||||
}
|
||||
|
||||
conf := &vault.CoreConfig{
|
||||
CredentialBackends: map[string]logical.Factory{
|
||||
"userpass": credUserpass.Factory,
|
||||
},
|
||||
DisableAutopilot: !ropts.EnableAutopilot,
|
||||
}
|
||||
conf.DisablePerformanceStandby = disablePerfStandby
|
||||
|
||||
var opts = vault.TestClusterOptions{HandlerFunc: vaulthttp.Handler}
|
||||
var opts = vault.TestClusterOptions{
|
||||
HandlerFunc: vaulthttp.Handler,
|
||||
}
|
||||
opts.Logger = logging.NewVaultLogger(hclog.Trace).Named(t.Name())
|
||||
|
||||
if ropts.InmemCluster {
|
||||
inmemCluster, err := vaultcluster.NewInmemLayerCluster("inmem-cluster", 3, hclog.New(&hclog.LoggerOptions{
|
||||
Mutex: &sync.Mutex{},
|
||||
Level: hclog.Trace,
|
||||
Name: "inmem-cluster",
|
||||
}))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
opts.ClusterLayers = inmemCluster
|
||||
}
|
||||
|
||||
opts.PhysicalFactoryConfig = ropts.PhysicalFactoryConfig
|
||||
conf.DisablePerformanceStandby = ropts.DisablePerfStandby
|
||||
|
||||
teststorage.RaftBackendSetup(conf, &opts)
|
||||
|
||||
if ropts.DisableFollowerJoins {
|
||||
opts.SetupFunc = nil
|
||||
}
|
||||
|
||||
cluster := vault.NewTestCluster(t, conf, &opts)
|
||||
cluster.Start()
|
||||
vault.TestWaitActive(t, cluster.Cores[0].Core)
|
||||
|
@ -230,7 +263,7 @@ func TestRaft_Join(t *testing.T) {
|
|||
|
||||
func TestRaft_RemovePeer(t *testing.T) {
|
||||
t.Parallel()
|
||||
cluster := raftCluster(t)
|
||||
cluster := raftCluster(t, nil)
|
||||
defer cluster.Cleanup()
|
||||
|
||||
for i, c := range cluster.Cores {
|
||||
|
@ -273,7 +306,7 @@ func TestRaft_RemovePeer(t *testing.T) {
|
|||
|
||||
func TestRaft_Configuration(t *testing.T) {
|
||||
t.Parallel()
|
||||
cluster := raftCluster(t)
|
||||
cluster := raftCluster(t, nil)
|
||||
defer cluster.Cleanup()
|
||||
|
||||
for i, c := range cluster.Cores {
|
||||
|
@ -320,7 +353,7 @@ func TestRaft_Configuration(t *testing.T) {
|
|||
|
||||
func TestRaft_ShamirUnseal(t *testing.T) {
|
||||
t.Parallel()
|
||||
cluster := raftCluster(t)
|
||||
cluster := raftCluster(t, nil)
|
||||
defer cluster.Cleanup()
|
||||
|
||||
for i, c := range cluster.Cores {
|
||||
|
@ -332,7 +365,7 @@ func TestRaft_ShamirUnseal(t *testing.T) {
|
|||
|
||||
func TestRaft_SnapshotAPI(t *testing.T) {
|
||||
t.Parallel()
|
||||
cluster := raftCluster(t)
|
||||
cluster := raftCluster(t, nil)
|
||||
defer cluster.Cleanup()
|
||||
|
||||
leaderClient := cluster.Cores[0].Client
|
||||
|
@ -467,7 +500,7 @@ func TestRaft_SnapshotAPI_RekeyRotate_Backward(t *testing.T) {
|
|||
tCaseLocal := tCase
|
||||
t.Parallel()
|
||||
|
||||
cluster := raftClusterWithPerfStandby(t, tCaseLocal.DisablePerfStandby)
|
||||
cluster := raftCluster(t, &RaftClusterOpts{DisablePerfStandby: tCaseLocal.DisablePerfStandby})
|
||||
defer cluster.Cleanup()
|
||||
|
||||
leaderClient := cluster.Cores[0].Client
|
||||
|
@ -668,7 +701,7 @@ func TestRaft_SnapshotAPI_RekeyRotate_Forward(t *testing.T) {
|
|||
tCaseLocal := tCase
|
||||
t.Parallel()
|
||||
|
||||
cluster := raftClusterWithPerfStandby(t, tCaseLocal.DisablePerfStandby)
|
||||
cluster := raftCluster(t, &RaftClusterOpts{DisablePerfStandby: tCaseLocal.DisablePerfStandby})
|
||||
defer cluster.Cleanup()
|
||||
|
||||
leaderClient := cluster.Cores[0].Client
|
||||
|
@ -855,7 +888,7 @@ func TestRaft_SnapshotAPI_RekeyRotate_Forward(t *testing.T) {
|
|||
|
||||
func TestRaft_SnapshotAPI_DifferentCluster(t *testing.T) {
|
||||
t.Parallel()
|
||||
cluster := raftCluster(t)
|
||||
cluster := raftCluster(t, nil)
|
||||
defer cluster.Cleanup()
|
||||
|
||||
leaderClient := cluster.Cores[0].Client
|
||||
|
@ -901,7 +934,7 @@ func TestRaft_SnapshotAPI_DifferentCluster(t *testing.T) {
|
|||
|
||||
// Cluster 2
|
||||
{
|
||||
cluster2 := raftCluster(t)
|
||||
cluster2 := raftCluster(t, nil)
|
||||
defer cluster2.Cleanup()
|
||||
|
||||
leaderClient := cluster2.Cores[0].Client
|
||||
|
@ -948,7 +981,7 @@ func TestRaft_SnapshotAPI_DifferentCluster(t *testing.T) {
|
|||
}
|
||||
|
||||
func BenchmarkRaft_SingleNode(b *testing.B) {
|
||||
cluster := raftCluster(b)
|
||||
cluster := raftCluster(b, nil)
|
||||
defer cluster.Cleanup()
|
||||
|
||||
leaderClient := cluster.Cores[0].Client
|
||||
|
|
|
@ -1,13 +1,14 @@
|
|||
package sealmigration
|
||||
|
||||
import (
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
|
||||
"github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/vault/helper/testhelpers"
|
||||
"github.com/hashicorp/vault/helper/testhelpers/teststorage"
|
||||
"github.com/hashicorp/vault/sdk/helper/logging"
|
||||
"github.com/hashicorp/vault/vault"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
)
|
||||
|
||||
type testFunc func(t *testing.T, logger hclog.Logger, storage teststorage.ReusableStorage, basePort int)
|
||||
|
|
|
@ -4,19 +4,20 @@ import (
|
|||
"context"
|
||||
"encoding/base64"
|
||||
"fmt"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/go-test/deep"
|
||||
"github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/go-kms-wrapping"
|
||||
wrapping "github.com/hashicorp/go-kms-wrapping"
|
||||
"github.com/hashicorp/vault/api"
|
||||
"github.com/hashicorp/vault/helper/namespace"
|
||||
"github.com/hashicorp/vault/helper/testhelpers"
|
||||
"github.com/hashicorp/vault/helper/testhelpers/seal"
|
||||
sealhelper "github.com/hashicorp/vault/helper/testhelpers/seal"
|
||||
"github.com/hashicorp/vault/helper/testhelpers/teststorage"
|
||||
"github.com/hashicorp/vault/http"
|
||||
"github.com/hashicorp/vault/physical/raft"
|
||||
"github.com/hashicorp/vault/vault"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
const (
|
||||
|
@ -206,6 +207,7 @@ func migrateFromTransitToShamir_Pre14(t *testing.T, logger hclog.Logger, storage
|
|||
UnwrapSealFunc: sealFunc,
|
||||
}
|
||||
storage.Setup(&conf, &opts)
|
||||
conf.DisableAutopilot = true
|
||||
cluster := vault.NewTestCluster(t, &conf, &opts)
|
||||
cluster.Start()
|
||||
defer func() {
|
||||
|
@ -267,7 +269,9 @@ func migrateFromTransitToShamir_Pre14(t *testing.T, logger hclog.Logger, storage
|
|||
func migrateFromShamirToTransit_Pre14(t *testing.T, logger hclog.Logger, storage teststorage.ReusableStorage, basePort int, tss *sealhelper.TransitSealServer, rootToken string, recoveryKeys [][]byte) func() vault.Seal {
|
||||
var baseClusterPort = basePort + 10
|
||||
|
||||
var conf = vault.CoreConfig{}
|
||||
var conf = vault.CoreConfig{
|
||||
DisableAutopilot: true,
|
||||
}
|
||||
var opts = vault.TestClusterOptions{
|
||||
Logger: logger.Named("migrateFromShamirToTransit"),
|
||||
HandlerFunc: http.Handler,
|
||||
|
@ -561,7 +565,9 @@ func initializeShamir(t *testing.T, logger hclog.Logger, storage teststorage.Reu
|
|||
var baseClusterPort = basePort + 10
|
||||
|
||||
// Start the cluster
|
||||
var conf = vault.CoreConfig{}
|
||||
var conf = vault.CoreConfig{
|
||||
DisableAutopilot: true,
|
||||
}
|
||||
var opts = vault.TestClusterOptions{
|
||||
Logger: logger.Named("initializeShamir"),
|
||||
HandlerFunc: http.Handler,
|
||||
|
@ -612,7 +618,9 @@ func runShamir(t *testing.T, logger hclog.Logger, storage teststorage.ReusableSt
|
|||
var baseClusterPort = basePort + 10
|
||||
|
||||
// Start the cluster
|
||||
var conf = vault.CoreConfig{}
|
||||
var conf = vault.CoreConfig{
|
||||
DisableAutopilot: true,
|
||||
}
|
||||
var opts = vault.TestClusterOptions{
|
||||
Logger: logger.Named("runShamir"),
|
||||
HandlerFunc: http.Handler,
|
||||
|
@ -681,7 +689,9 @@ func InitializeTransit(t *testing.T, logger hclog.Logger, storage teststorage.Re
|
|||
var baseClusterPort = basePort + 10
|
||||
|
||||
// Start the cluster
|
||||
var conf = vault.CoreConfig{}
|
||||
var conf = vault.CoreConfig{
|
||||
DisableAutopilot: true,
|
||||
}
|
||||
var opts = vault.TestClusterOptions{
|
||||
Logger: logger.Named("initializeTransit"),
|
||||
HandlerFunc: http.Handler,
|
||||
|
@ -734,7 +744,9 @@ func runAutoseal(t *testing.T, logger hclog.Logger, storage teststorage.Reusable
|
|||
var baseClusterPort = basePort + 10
|
||||
|
||||
// Start the cluster
|
||||
var conf = vault.CoreConfig{}
|
||||
var conf = vault.CoreConfig{
|
||||
DisableAutopilot: true,
|
||||
}
|
||||
var opts = vault.TestClusterOptions{
|
||||
Logger: logger.Named("runTransit"),
|
||||
HandlerFunc: http.Handler,
|
||||
|
|
|
@ -5,16 +5,19 @@ import (
|
|||
"crypto/subtle"
|
||||
"encoding/base64"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/vault/sdk/framework"
|
||||
"github.com/hashicorp/vault/sdk/logical"
|
||||
"github.com/hashicorp/vault/sdk/physical"
|
||||
|
||||
proto "github.com/golang/protobuf/proto"
|
||||
wrapping "github.com/hashicorp/go-kms-wrapping"
|
||||
uuid "github.com/hashicorp/go-uuid"
|
||||
"github.com/hashicorp/vault/helper/namespace"
|
||||
"github.com/hashicorp/vault/physical/raft"
|
||||
"github.com/hashicorp/vault/sdk/framework"
|
||||
"github.com/hashicorp/vault/sdk/logical"
|
||||
"github.com/hashicorp/vault/sdk/physical"
|
||||
)
|
||||
|
||||
// raftStoragePaths returns paths for use when raft is the storage mechanism.
|
||||
|
@ -145,6 +148,60 @@ func (b *SystemBackend) raftStoragePaths() []*framework.Path {
|
|||
HelpSynopsis: strings.TrimSpace(sysRaftHelp["raft-snapshot-force"][0]),
|
||||
HelpDescription: strings.TrimSpace(sysRaftHelp["raft-snapshot-force"][1]),
|
||||
},
|
||||
{
|
||||
Pattern: "storage/raft/autopilot/state",
|
||||
Operations: map[logical.Operation]framework.OperationHandler{
|
||||
logical.ReadOperation: &framework.PathOperation{
|
||||
Callback: b.handleStorageRaftAutopilotState(),
|
||||
Summary: "Returns the state of the raft cluster under integrated storage as seen by autopilot.",
|
||||
},
|
||||
},
|
||||
|
||||
HelpSynopsis: strings.TrimSpace(sysRaftHelp["raft-autopilot-state"][0]),
|
||||
HelpDescription: strings.TrimSpace(sysRaftHelp["raft-autopilot-state"][1]),
|
||||
},
|
||||
{
|
||||
Pattern: "storage/raft/autopilot/configuration",
|
||||
|
||||
Fields: map[string]*framework.FieldSchema{
|
||||
"cleanup_dead_servers": {
|
||||
Type: framework.TypeBool,
|
||||
Description: "Controls whether to remove dead servers from the Raft peer list periodically or when a new server joins.",
|
||||
},
|
||||
"last_contact_threshold": {
|
||||
Type: framework.TypeDurationSecond,
|
||||
Description: "Limit on the amount of time a server can go without leader contact before being considered unhealthy.",
|
||||
},
|
||||
"dead_server_last_contact_threshold": {
|
||||
Type: framework.TypeDurationSecond,
|
||||
Description: "Limit on the amount of time a server can go without leader contact before being considered failed. This takes effect only when cleanup_dead_servers is set.",
|
||||
},
|
||||
"max_trailing_logs": {
|
||||
Type: framework.TypeInt,
|
||||
Description: "Amount of entries in the Raft Log that a server can be behind before being considered unhealthy.",
|
||||
},
|
||||
"min_quorum": {
|
||||
Type: framework.TypeInt,
|
||||
Description: "Minimum number of servers allowed in a cluster before autopilot can prune dead servers. This should at least be 3.",
|
||||
},
|
||||
"server_stabilization_time": {
|
||||
Type: framework.TypeDurationSecond,
|
||||
Description: "Minimum amount of time a server must be in a stable, healthy state before it can be added to the cluster.",
|
||||
},
|
||||
},
|
||||
|
||||
Operations: map[logical.Operation]framework.OperationHandler{
|
||||
logical.ReadOperation: &framework.PathOperation{
|
||||
Callback: b.handleStorageRaftAutopilotConfigRead(),
|
||||
},
|
||||
logical.UpdateOperation: &framework.PathOperation{
|
||||
Callback: b.handleStorageRaftAutopilotConfigUpdate(),
|
||||
},
|
||||
},
|
||||
|
||||
HelpSynopsis: strings.TrimSpace(sysRaftHelp["raft-autopilot-configuration"][0]),
|
||||
HelpDescription: strings.TrimSpace(sysRaftHelp["raft-autopilot-configuration"][1]),
|
||||
},
|
||||
}
|
||||
}
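These paths back the sys/storage/raft/autopilot/state and sys/storage/raft/autopilot/configuration endpoints that the tests in this diff exercise through the Go API client. A sketch of operator-side usage, assuming the address and token come from the usual environment variables; the written values are illustrative:

package main

import (
    "fmt"
    "log"
    "time"

    "github.com/hashicorp/vault/api"
)

func main() {
    client, err := api.NewClient(api.DefaultConfig())
    if err != nil {
        log.Fatal(err)
    }

    // Read the cluster state as seen by autopilot.
    state, err := client.Sys().RaftAutopilotState()
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(state.Healthy, state.Voters)

    // Read the effective autopilot configuration.
    config, err := client.Sys().RaftAutopilotConfiguration()
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(config.ServerStabilizationTime)

    // Update a subset of the configuration; unspecified fields keep their
    // current values (see the Merge semantics above).
    _, err = client.Logical().Write("sys/storage/raft/autopilot/configuration", map[string]interface{}{
        "cleanup_dead_servers":               true,
        "min_quorum":                         3,
        "dead_server_last_contact_threshold": (10 * time.Minute).String(),
    })
    if err != nil {
        log.Fatal(err)
    }
}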
|
||||
|
||||
|
@ -184,7 +241,7 @@ func (b *SystemBackend) handleRaftRemovePeerUpdate() framework.OperationFunc {
|
|||
return nil, err
|
||||
}
|
||||
if b.Core.raftFollowerStates != nil {
|
||||
b.Core.raftFollowerStates.delete(serverID)
|
||||
b.Core.raftFollowerStates.Delete(serverID)
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
|
@ -296,8 +353,16 @@ func (b *SystemBackend) handleRaftBootstrapAnswerWrite() framework.OperationFunc
|
|||
return nil, err
|
||||
}
|
||||
|
||||
var desiredSuffrage string
|
||||
switch nonVoter {
|
||||
case true:
|
||||
desiredSuffrage = "voter"
|
||||
default:
|
||||
desiredSuffrage = "non-voter"
|
||||
}
|
||||
|
||||
if b.Core.raftFollowerStates != nil {
|
||||
b.Core.raftFollowerStates.update(serverID, 0)
|
||||
b.Core.raftFollowerStates.Update(serverID, 0, 0, desiredSuffrage)
|
||||
}
|
||||
|
||||
peers, err := raftBackend.Peers(ctx)
|
||||
|
@ -335,6 +400,139 @@ func (b *SystemBackend) handleStorageRaftSnapshotRead() framework.OperationFunc
|
|||
}
|
||||
}
|
||||
|
||||
func (b *SystemBackend) handleStorageRaftAutopilotState() framework.OperationFunc {
|
||||
return func(ctx context.Context, req *logical.Request, d *framework.FieldData) (*logical.Response, error) {
|
||||
raftBackend, ok := b.Core.underlyingPhysical.(*raft.RaftBackend)
|
||||
if !ok {
|
||||
return logical.ErrorResponse("raft storage is not in use"), logical.ErrInvalidRequest
|
||||
}
|
||||
|
||||
state, err := raftBackend.GetAutopilotServerState(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if state == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
return &logical.Response{
|
||||
Data: map[string]interface{}{
|
||||
"execution_status": state.ExecutionStatus,
|
||||
"healthy": state.Healthy,
|
||||
"failure_tolerance": state.FailureTolerance,
|
||||
"optimistic_failure_tolerance": state.OptimisticFailureTolerance,
|
||||
"servers": state.Servers,
|
||||
"leader": state.Leader,
|
||||
"voters": state.Voters,
|
||||
"non_voters": state.NonVoters,
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
}
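A minimal sketch (an assumption, not part of this diff) of reading the state endpoint served by the handler above, given an already-configured *api.Client and imports of fmt and github.com/hashicorp/vault/api:

func printAutopilotState(client *api.Client) error {
	resp, err := client.Logical().Read("sys/storage/raft/autopilot/state")
	if err != nil {
		return err
	}
	if resp == nil {
		// Mirrors the handler returning nil, nil when no state is available yet.
		fmt.Println("no autopilot state available")
		return nil
	}
	fmt.Printf("healthy=%v failure_tolerance=%v leader=%v voters=%v\n",
		resp.Data["healthy"], resp.Data["failure_tolerance"], resp.Data["leader"], resp.Data["voters"])
	return nil
}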
|
||||
|
||||
func (b *SystemBackend) handleStorageRaftAutopilotConfigRead() framework.OperationFunc {
|
||||
return func(ctx context.Context, req *logical.Request, d *framework.FieldData) (*logical.Response, error) {
|
||||
raftStorage, ok := b.Core.underlyingPhysical.(*raft.RaftBackend)
|
||||
if !ok {
|
||||
return logical.ErrorResponse("raft storage is not in use"), logical.ErrInvalidRequest
|
||||
}
|
||||
|
||||
config := raftStorage.AutopilotConfig()
|
||||
if config == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
return &logical.Response{
|
||||
Data: map[string]interface{}{
|
||||
"cleanup_dead_servers": config.CleanupDeadServers,
|
||||
"last_contact_threshold": config.LastContactThreshold.String(),
|
||||
"dead_server_last_contact_threshold": config.DeadServerLastContactThreshold.String(),
|
||||
"max_trailing_logs": config.MaxTrailingLogs,
|
||||
"min_quorum": config.MinQuorum,
|
||||
"server_stabilization_time": config.ServerStabilizationTime.String(),
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
}
|
||||
|
||||
func (b *SystemBackend) handleStorageRaftAutopilotConfigUpdate() framework.OperationFunc {
|
||||
return func(ctx context.Context, req *logical.Request, d *framework.FieldData) (*logical.Response, error) {
|
||||
raftStorage, ok := b.Core.underlyingPhysical.(*raft.RaftBackend)
|
||||
if !ok {
|
||||
return logical.ErrorResponse("raft storage is not in use"), logical.ErrInvalidRequest
|
||||
}
|
||||
|
||||
// Read autopilot configuration from storage
|
||||
config, err := b.Core.loadAutopilotConfiguration(ctx)
|
||||
if err != nil {
|
||||
b.logger.Error("failed to load autopilot config from storage when setting up cluster; continuing since autopilot falls back to default config", "error", err)
|
||||
}
|
||||
if config == nil {
|
||||
config = &raft.AutopilotConfig{}
|
||||
}
|
||||
|
||||
persist := false
|
||||
cleanupDeadServers, ok := d.GetOk("cleanup_dead_servers")
|
||||
if ok {
|
||||
if cleanupDeadServers.(bool) {
|
||||
config.CleanupDeadServersValue = raft.CleanupDeadServersTrue
|
||||
} else {
|
||||
config.CleanupDeadServersValue = raft.CleanupDeadServersFalse
|
||||
}
|
||||
persist = true
|
||||
}
|
||||
lastContactThreshold, ok := d.GetOk("last_contact_threshold")
|
||||
if ok {
|
||||
config.LastContactThreshold = time.Duration(lastContactThreshold.(int)) * time.Second
|
||||
persist = true
|
||||
}
|
||||
deadServerLastContactThreshold, ok := d.GetOk("dead_server_last_contact_threshold")
|
||||
if ok {
|
||||
config.DeadServerLastContactThreshold = time.Duration(deadServerLastContactThreshold.(int)) * time.Second
|
||||
persist = true
|
||||
}
|
||||
maxTrailingLogs, ok := d.GetOk("max_trailing_logs")
|
||||
if ok {
|
||||
config.MaxTrailingLogs = uint64(maxTrailingLogs.(int))
|
||||
persist = true
|
||||
}
|
||||
minQuorum, ok := d.GetOk("min_quorum")
|
||||
if ok {
|
||||
config.MinQuorum = uint(minQuorum.(int))
|
||||
persist = true
|
||||
}
|
||||
serverStabilizationTime, ok := d.GetOk("server_stabilization_time")
|
||||
if ok {
|
||||
config.ServerStabilizationTime = time.Duration(serverStabilizationTime.(int)) * time.Second
|
||||
persist = true
|
||||
}
|
||||
|
||||
effectiveConf := raftStorage.AutopilotConfig()
|
||||
effectiveConf.Merge(config)
|
||||
|
||||
if effectiveConf.CleanupDeadServers && effectiveConf.MinQuorum < 3 {
|
||||
return logical.ErrorResponse(fmt.Sprintf("min_quorum must be set when cleanup_dead_servers is set and it should at least be 3; cleanup_dead_servers: %#v, min_quorum: %#v", effectiveConf.CleanupDeadServers, effectiveConf.MinQuorum)), logical.ErrInvalidRequest
|
||||
}
|
||||
|
||||
// Persist only the user supplied fields
|
||||
if persist {
|
||||
entry, err := logical.StorageEntryJSON(raftAutopilotConfigurationStoragePath, config)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := b.Core.barrier.Put(ctx, entry); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
// Set the effectiveConfig
|
||||
raftStorage.SetAutopilotConfig(effectiveConf)
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
}
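To make the persist/merge split above easier to follow: only the user-supplied delta is written to storage, while the effective configuration is that delta merged over the currently running config and validated as a whole. A self-contained sketch with a simplified stand-in type (the real raft.AutopilotConfig and its Merge method live in physical/raft; this only illustrates why min_quorum is checked against the merged values rather than the stored delta):

// Simplified stand-in for raft.AutopilotConfig, used only to illustrate the
// merge-then-validate flow in the handler above.
type simpleAutopilotConfig struct {
	CleanupDeadServers bool
	MinQuorum          uint
}

// mergeConfig overlays the user-supplied, persisted fields onto the currently
// effective configuration, mimicking effectiveConf.Merge(config).
func mergeConfig(effective, supplied simpleAutopilotConfig) simpleAutopilotConfig {
	out := effective
	if supplied.CleanupDeadServers {
		out.CleanupDeadServers = true
	}
	if supplied.MinQuorum != 0 {
		out.MinQuorum = supplied.MinQuorum
	}
	return out
}

// validateConfig enforces the same invariant as the handler: dead server
// cleanup requires a minimum quorum of at least 3 on the merged result.
func validateConfig(effective simpleAutopilotConfig) error {
	if effective.CleanupDeadServers && effective.MinQuorum < 3 {
		return fmt.Errorf("min_quorum must be at least 3 when cleanup_dead_servers is set, got %d", effective.MinQuorum)
	}
	return nil
}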
|
||||
|
||||
func (b *SystemBackend) handleStorageRaftSnapshotWrite(force bool) framework.OperationFunc {
|
||||
return func(ctx context.Context, req *logical.Request, d *framework.FieldData) (*logical.Response, error) {
|
||||
raftStorage, ok := b.Core.underlyingPhysical.(*raft.RaftBackend)
|
||||
|
@ -475,4 +673,12 @@ var sysRaftHelp = map[string][2]string{
|
|||
"Force restore a raft cluster snapshot",
|
||||
"",
|
||||
},
|
||||
"raft-autopilot-state": {
|
||||
"Returns the state of the raft cluster under integrated storage as seen by autopilot.",
|
||||
"",
|
||||
},
|
||||
"raft-autopilot-configuration": {
|
||||
"Returns autopilot configuration.",
|
||||
"",
|
||||
},
|
||||
}
|
||||
|
|
vault/raft.go (115 changes)
|
@ -5,7 +5,6 @@ import (
|
|||
"encoding/base64"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
|
@ -35,53 +34,12 @@ var (
|
|||
raftTLSStoragePath = "core/raft/tls"
|
||||
raftTLSRotationPeriod = 24 * time.Hour
|
||||
|
||||
raftAutopilotConfigurationStoragePath = "core/raft/autopilot/configuration"
|
||||
|
||||
// TestingUpdateClusterAddr is used in tests to override the cluster address
|
||||
TestingUpdateClusterAddr uint32
|
||||
)
|
||||
|
||||
type raftFollowerStates struct {
|
||||
l sync.RWMutex
|
||||
followers map[string]uint64
|
||||
}
|
||||
|
||||
func (s *raftFollowerStates) update(nodeID string, appliedIndex uint64) {
|
||||
s.l.Lock()
|
||||
s.followers[nodeID] = appliedIndex
|
||||
s.l.Unlock()
|
||||
}
|
||||
func (s *raftFollowerStates) delete(nodeID string) {
|
||||
s.l.RLock()
|
||||
delete(s.followers, nodeID)
|
||||
s.l.RUnlock()
|
||||
}
|
||||
func (s *raftFollowerStates) get(nodeID string) uint64 {
|
||||
s.l.RLock()
|
||||
index := s.followers[nodeID]
|
||||
s.l.RUnlock()
|
||||
return index
|
||||
}
|
||||
func (s *raftFollowerStates) minIndex() uint64 {
|
||||
var min uint64 = math.MaxUint64
|
||||
minFunc := func(a, b uint64) uint64 {
|
||||
if a > b {
|
||||
return b
|
||||
}
|
||||
return a
|
||||
}
|
||||
|
||||
s.l.RLock()
|
||||
for _, i := range s.followers {
|
||||
min = minFunc(min, i)
|
||||
}
|
||||
s.l.RUnlock()
|
||||
|
||||
if min == math.MaxUint64 {
|
||||
return 0
|
||||
}
|
||||
|
||||
return min
|
||||
}
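This map-based tracker is removed in favor of raft.FollowerStates, whose Update, Delete, MinIndex and Clear calls appear throughout the rest of this diff. A rough sketch of the replacement usage (the NewFollowerStates constructor name is an assumption; the method calls are the ones this diff uses):

func trackFollowers(keyringAppliedIndex uint64) {
	states := raft.NewFollowerStates() // assumed constructor name
	// Update takes node ID, applied index, term and desired suffrage.
	states.Update("node-2", 120, 3, "voter")
	states.Update("node-3", 118, 3, "non-voter")

	// The TLS key rotation code gates on all followers having applied the
	// latest keyring entry.
	allCaughtUp := states.MinIndex() >= keyringAppliedIndex
	_ = allCaughtUp

	states.Delete("node-3") // e.g. after a remove-peer operation
	states.Clear()          // e.g. when the active node steps down
}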
|
||||
|
||||
func (c *Core) GetRaftIndexes() (committed uint64, applied uint64) {
|
||||
c.stateLock.RLock()
|
||||
defer c.stateLock.RUnlock()
|
||||
|
@ -98,7 +56,7 @@ func (c *Core) GetRaftIndexes() (committed uint64, applied uint64) {
|
|||
// up and enables the cluster handler.
|
||||
func (c *Core) startRaftBackend(ctx context.Context) (retErr error) {
|
||||
raftBackend := c.getRaftBackend()
|
||||
if raftBackend == nil || raftBackend.Initialized() {
|
||||
if raftBackend == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -161,6 +119,7 @@ func (c *Core) startRaftBackend(ctx context.Context) (retErr error) {
|
|||
}
|
||||
|
||||
raftBackend.SetRestoreCallback(c.raftSnapshotRestoreCallback(true, true))
|
||||
|
||||
if err := raftBackend.SetupCluster(ctx, raft.SetupOpts{
|
||||
TLSKeyring: raftTLS,
|
||||
ClusterListener: c.getClusterListener(),
|
||||
|
@ -198,11 +157,35 @@ func (c *Core) startRaftBackend(ctx context.Context) (retErr error) {
|
|||
}
|
||||
|
||||
func (c *Core) setupRaftActiveNode(ctx context.Context) error {
|
||||
raftBackend := c.getRaftBackend()
|
||||
if raftBackend == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
c.logger.Info("starting raft active node")
|
||||
|
||||
autopilotConfig, err := c.loadAutopilotConfiguration(ctx)
|
||||
if err != nil {
|
||||
c.logger.Error("failed to load autopilot config from storage when setting up cluster; continuing since autopilot falls back to default config", "error", err)
|
||||
}
|
||||
raftBackend.SetupAutopilot(c.activeContext, autopilotConfig, c.raftFollowerStates, c.disableAutopilot)
|
||||
|
||||
c.pendingRaftPeers = &sync.Map{}
|
||||
return c.startPeriodicRaftTLSRotate(ctx)
|
||||
}
|
||||
|
||||
func (c *Core) stopRaftActiveNode() {
|
||||
raftBackend := c.getRaftBackend()
|
||||
if raftBackend == nil {
|
||||
return
|
||||
}
|
||||
|
||||
c.logger.Info("stopping raft active node")
|
||||
|
||||
if !raftBackend.AutopilotDisabled() {
|
||||
raftBackend.StopAutopilot()
|
||||
}
|
||||
|
||||
c.pendingRaftPeers = nil
|
||||
c.stopPeriodicRaftTLSRotate()
|
||||
}
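A sketch (an assumption, not how Vault actually wires it) of how a leadership-change loop could drive the two hooks above; in Vault they are invoked from the active node's post-unseal and pre-seal paths:

func runActiveNodeLifecycle(ctx context.Context, c *Core, leaderCh <-chan bool) error {
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case isLeader := <-leaderCh:
			if isLeader {
				// Gaining leadership: load the stored config and start autopilot.
				if err := c.setupRaftActiveNode(ctx); err != nil {
					return err
				}
			} else {
				// Losing leadership: stop autopilot and TLS rotation.
				c.stopRaftActiveNode()
			}
		}
	}
}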
|
||||
|
@ -334,9 +317,8 @@ func (c *Core) raftTLSRotateDirect(ctx context.Context, logger hclog.Logger, sto
|
|||
// to reconnect with the cluster. Additionally, only one outstanding key
|
||||
// is allowed for this same reason (max keyring size of 2).
|
||||
func (c *Core) raftTLSRotatePhased(ctx context.Context, logger hclog.Logger, raftBackend *raft.RaftBackend, stopCh chan struct{}) error {
|
||||
followerStates := &raftFollowerStates{
|
||||
followers: make(map[string]uint64),
|
||||
}
|
||||
followerStates := c.raftFollowerStates
|
||||
followerStates.Clear()
|
||||
|
||||
// Pre-populate the follower list with the set of peers.
|
||||
raftConfig, err := raftBackend.GetConfiguration(ctx)
|
||||
|
@ -345,10 +327,9 @@ func (c *Core) raftTLSRotatePhased(ctx context.Context, logger hclog.Logger, raf
|
|||
}
|
||||
for _, server := range raftConfig.Servers {
|
||||
if server.NodeID != raftBackend.NodeID() {
|
||||
followerStates.update(server.NodeID, 0)
|
||||
followerStates.Update(server.NodeID, 0, 0, "voter")
|
||||
}
|
||||
}
|
||||
c.raftFollowerStates = followerStates
|
||||
|
||||
// rotateKeyring writes new key data to the keyring and adds an applied
|
||||
// index that is used to verify it has been committed. The keys written in
|
||||
|
@ -437,7 +418,7 @@ func (c *Core) raftTLSRotatePhased(ctx context.Context, logger hclog.Logger, raf
|
|||
case keyring.Keys[1].AppliedIndex != keyring.AppliedIndex:
|
||||
// We haven't fully committed the new key, continue here
|
||||
return nil
|
||||
case followerStates.minIndex() < keyring.AppliedIndex:
|
||||
case followerStates.MinIndex() < keyring.AppliedIndex:
|
||||
// Not all the followers have applied the latest key
|
||||
return nil
|
||||
}
|
||||
|
@ -574,7 +555,7 @@ func (c *Core) stopPeriodicRaftTLSRotate() {
|
|||
close(c.raftTLSRotationStopCh)
|
||||
}
|
||||
c.raftTLSRotationStopCh = nil
|
||||
c.raftFollowerStates = nil
|
||||
c.raftFollowerStates.Clear()
|
||||
}
|
||||
|
||||
func (c *Core) checkRaftTLSKeyUpgrades(ctx context.Context) error {
|
||||
|
@ -716,6 +697,11 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
|
|||
return false, errors.New("raft backend not in use")
|
||||
}
|
||||
|
||||
if err := raftBackend.SetDesiredSuffrage(nonVoter); err != nil {
|
||||
c.logger.Error("failed to set desired suffrage for this node", "error", err)
|
||||
return false, nil
|
||||
}
|
||||
|
||||
init, err := c.InitializedLocally(ctx)
|
||||
if err != nil {
|
||||
return false, errwrap.Wrapf("failed to check if core is initialized: {{err}}", err)
|
||||
|
@ -731,7 +717,7 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
|
|||
// Check on seal status and storage type before proceeding:
|
||||
// If raft is used for storage, core needs to be sealed
|
||||
if !isRaftHAOnly && !c.Sealed() {
|
||||
c.logger.Error("node must be seal before joining")
|
||||
c.logger.Error("node must be sealed before joining")
|
||||
return false, errors.New("node must be sealed before joining")
|
||||
}
|
||||
|
||||
|
@ -1105,10 +1091,11 @@ func (c *Core) joinRaftSendAnswer(ctx context.Context, sealAccess *seal.Access,
|
|||
}
|
||||
|
||||
raftBackend.SetRestoreCallback(c.raftSnapshotRestoreCallback(true, true))
|
||||
err = raftBackend.SetupCluster(ctx, raft.SetupOpts{
|
||||
opts := raft.SetupOpts{
|
||||
TLSKeyring: answerResp.Data.TLSKeyring,
|
||||
ClusterListener: c.getClusterListener(),
|
||||
})
|
||||
}
|
||||
err = raftBackend.SetupCluster(ctx, opts)
|
||||
if err != nil {
|
||||
return errwrap.Wrapf("failed to setup raft cluster: {{err}}", err)
|
||||
}
|
||||
|
@ -1116,6 +1103,24 @@ func (c *Core) joinRaftSendAnswer(ctx context.Context, sealAccess *seal.Access,
|
|||
return nil
|
||||
}
|
||||
|
||||
func (c *Core) loadAutopilotConfiguration(ctx context.Context) (*raft.AutopilotConfig, error) {
|
||||
var autopilotConfig *raft.AutopilotConfig
|
||||
entry, err := c.barrier.Get(ctx, raftAutopilotConfigurationStoragePath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if entry == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
if err := jsonutil.DecodeJSON(entry.Value, &autopilotConfig); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return autopilotConfig, nil
|
||||
}
|
||||
|
||||
// RaftBootstrap performs bootstrapping of a raft cluster if core contains a raft
|
||||
// backend. If raft is not part for the storage or HA storage backend, this
|
||||
// call results in an error.
|
||||
|
|
|
@ -8,6 +8,7 @@ import (
|
|||
"time"
|
||||
|
||||
"github.com/hashicorp/vault/helper/forwarding"
|
||||
"github.com/hashicorp/vault/physical/raft"
|
||||
"github.com/hashicorp/vault/sdk/helper/consts"
|
||||
"github.com/hashicorp/vault/vault/replication"
|
||||
)
|
||||
|
@ -17,7 +18,7 @@ type forwardedRequestRPCServer struct {
|
|||
handler http.Handler
|
||||
perfStandbySlots chan struct{}
|
||||
perfStandbyRepCluster *replication.Cluster
|
||||
raftFollowerStates *raftFollowerStates
|
||||
raftFollowerStates *raft.FollowerStates
|
||||
}
|
||||
|
||||
func (s *forwardedRequestRPCServer) ForwardRequest(ctx context.Context, freq *forwarding.Request) (*forwarding.Response, error) {
|
||||
|
@ -73,7 +74,7 @@ func (s *forwardedRequestRPCServer) Echo(ctx context.Context, in *EchoRequest) (
|
|||
}
|
||||
|
||||
if in.RaftAppliedIndex > 0 && len(in.RaftNodeID) > 0 && s.raftFollowerStates != nil {
|
||||
s.raftFollowerStates.update(in.RaftNodeID, in.RaftAppliedIndex)
|
||||
s.raftFollowerStates.Update(in.RaftNodeID, in.RaftAppliedIndex, in.RaftTerm, in.RaftDesiredSuffrage)
|
||||
}
|
||||
|
||||
reply := &EchoReply{
|
||||
|
@ -116,6 +117,8 @@ func (c *forwardingClient) startHeartbeat() {
|
|||
if !c.core.isRaftHAOnly() {
|
||||
req.RaftAppliedIndex = raftBackend.AppliedIndex()
|
||||
req.RaftNodeID = raftBackend.NodeID()
|
||||
req.RaftTerm = raftBackend.Term()
|
||||
req.RaftDesiredSuffrage = raftBackend.DesiredSuffrage()
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -41,10 +41,12 @@ type EchoRequest struct {
|
|||
ClusterAddr string `protobuf:"bytes,2,opt,name=cluster_addr,json=clusterAddr,proto3" json:"cluster_addr,omitempty"`
|
||||
// ClusterAddrs is used to send up a list of cluster addresses to a dr
|
||||
// primary from a dr secondary
|
||||
ClusterAddrs []string `protobuf:"bytes,3,rep,name=cluster_addrs,json=clusterAddrs,proto3" json:"cluster_addrs,omitempty"`
|
||||
RaftAppliedIndex uint64 `protobuf:"varint,4,opt,name=raft_applied_index,json=raftAppliedIndex,proto3" json:"raft_applied_index,omitempty"`
|
||||
RaftNodeID string `protobuf:"bytes,5,opt,name=raft_node_id,json=raftNodeId,proto3" json:"raft_node_id,omitempty"`
|
||||
NodeInfo *NodeInformation `protobuf:"bytes,6,opt,name=node_info,json=nodeInfo,proto3" json:"node_info,omitempty"`
|
||||
ClusterAddrs []string `protobuf:"bytes,3,rep,name=cluster_addrs,json=clusterAddrs,proto3" json:"cluster_addrs,omitempty"`
|
||||
RaftAppliedIndex uint64 `protobuf:"varint,4,opt,name=raft_applied_index,json=raftAppliedIndex,proto3" json:"raft_applied_index,omitempty"`
|
||||
RaftNodeID string `protobuf:"bytes,5,opt,name=raft_node_id,json=raftNodeId,proto3" json:"raft_node_id,omitempty"`
|
||||
NodeInfo *NodeInformation `protobuf:"bytes,6,opt,name=node_info,json=nodeInfo,proto3" json:"node_info,omitempty"`
|
||||
RaftTerm uint64 `protobuf:"varint,7,opt,name=raft_term,json=raftTerm,proto3" json:"raft_term,omitempty"`
|
||||
RaftDesiredSuffrage string `protobuf:"bytes,8,opt,name=raft_desired_suffrage,json=raftDesiredSuffrage,proto3" json:"raft_desired_suffrage,omitempty"`
|
||||
}
|
||||
|
||||
func (x *EchoRequest) Reset() {
|
||||
|
@ -121,6 +123,20 @@ func (x *EchoRequest) GetNodeInfo() *NodeInformation {
|
|||
return nil
|
||||
}
|
||||
|
||||
func (x *EchoRequest) GetRaftTerm() uint64 {
|
||||
if x != nil {
|
||||
return x.RaftTerm
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func (x *EchoRequest) GetRaftDesiredSuffrage() string {
|
||||
if x != nil {
|
||||
return x.RaftDesiredSuffrage
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
type EchoReply struct {
|
||||
state protoimpl.MessageState
|
||||
sizeCache protoimpl.SizeCache
|
||||
|
@ -490,8 +506,8 @@ var file_vault_request_forwarding_service_proto_rawDesc = []byte{
|
|||
0x66, 0x6f, 0x72, 0x77, 0x61, 0x72, 0x64, 0x69, 0x6e, 0x67, 0x5f, 0x73, 0x65, 0x72, 0x76, 0x69,
|
||||
0x63, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x05, 0x76, 0x61, 0x75, 0x6c, 0x74, 0x1a,
|
||||
0x1d, 0x68, 0x65, 0x6c, 0x70, 0x65, 0x72, 0x2f, 0x66, 0x6f, 0x72, 0x77, 0x61, 0x72, 0x64, 0x69,
|
||||
0x6e, 0x67, 0x2f, 0x74, 0x79, 0x70, 0x65, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x22, 0xf4,
|
||||
0x01, 0x0a, 0x0b, 0x45, 0x63, 0x68, 0x6f, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x18,
|
||||
0x6e, 0x67, 0x2f, 0x74, 0x79, 0x70, 0x65, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x22, 0xc5,
|
||||
0x02, 0x0a, 0x0b, 0x45, 0x63, 0x68, 0x6f, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x18,
|
||||
0x0a, 0x07, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52,
|
||||
0x07, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x12, 0x21, 0x0a, 0x0c, 0x63, 0x6c, 0x75, 0x73,
|
||||
0x74, 0x65, 0x72, 0x5f, 0x61, 0x64, 0x64, 0x72, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b,
|
||||
|
@ -506,73 +522,78 @@ var file_vault_request_forwarding_service_proto_rawDesc = []byte{
|
|||
0x12, 0x33, 0x0a, 0x09, 0x6e, 0x6f, 0x64, 0x65, 0x5f, 0x69, 0x6e, 0x66, 0x6f, 0x18, 0x06, 0x20,
|
||||
0x01, 0x28, 0x0b, 0x32, 0x16, 0x2e, 0x76, 0x61, 0x75, 0x6c, 0x74, 0x2e, 0x4e, 0x6f, 0x64, 0x65,
|
||||
0x49, 0x6e, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x08, 0x6e, 0x6f, 0x64,
|
||||
0x65, 0x49, 0x6e, 0x66, 0x6f, 0x22, 0xfc, 0x01, 0x0a, 0x09, 0x45, 0x63, 0x68, 0x6f, 0x52, 0x65,
|
||||
0x70, 0x6c, 0x79, 0x12, 0x18, 0x0a, 0x07, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x18, 0x01,
|
||||
0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x12, 0x23, 0x0a,
|
||||
0x0d, 0x63, 0x6c, 0x75, 0x73, 0x74, 0x65, 0x72, 0x5f, 0x61, 0x64, 0x64, 0x72, 0x73, 0x18, 0x02,
|
||||
0x20, 0x03, 0x28, 0x09, 0x52, 0x0c, 0x63, 0x6c, 0x75, 0x73, 0x74, 0x65, 0x72, 0x41, 0x64, 0x64,
|
||||
0x72, 0x73, 0x12, 0x2b, 0x0a, 0x11, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x69, 0x6f,
|
||||
0x6e, 0x5f, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x10, 0x72,
|
||||
0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12,
|
||||
0x2c, 0x0a, 0x12, 0x72, 0x61, 0x66, 0x74, 0x5f, 0x61, 0x70, 0x70, 0x6c, 0x69, 0x65, 0x64, 0x5f,
|
||||
0x69, 0x6e, 0x64, 0x65, 0x78, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x10, 0x72, 0x61, 0x66,
|
||||
0x74, 0x41, 0x70, 0x70, 0x6c, 0x69, 0x65, 0x64, 0x49, 0x6e, 0x64, 0x65, 0x78, 0x12, 0x20, 0x0a,
|
||||
0x0c, 0x72, 0x61, 0x66, 0x74, 0x5f, 0x6e, 0x6f, 0x64, 0x65, 0x5f, 0x69, 0x64, 0x18, 0x05, 0x20,
|
||||
0x01, 0x28, 0x09, 0x52, 0x0a, 0x72, 0x61, 0x66, 0x74, 0x4e, 0x6f, 0x64, 0x65, 0x49, 0x64, 0x12,
|
||||
0x33, 0x0a, 0x09, 0x6e, 0x6f, 0x64, 0x65, 0x5f, 0x69, 0x6e, 0x66, 0x6f, 0x18, 0x06, 0x20, 0x01,
|
||||
0x28, 0x0b, 0x32, 0x16, 0x2e, 0x76, 0x61, 0x75, 0x6c, 0x74, 0x2e, 0x4e, 0x6f, 0x64, 0x65, 0x49,
|
||||
0x6e, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x08, 0x6e, 0x6f, 0x64, 0x65,
|
||||
0x49, 0x6e, 0x66, 0x6f, 0x22, 0xa9, 0x01, 0x0a, 0x0f, 0x4e, 0x6f, 0x64, 0x65, 0x49, 0x6e, 0x66,
|
||||
0x6f, 0x72, 0x6d, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x12, 0x21, 0x0a, 0x0c, 0x63, 0x6c, 0x75, 0x73,
|
||||
0x74, 0x65, 0x72, 0x5f, 0x61, 0x64, 0x64, 0x72, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b,
|
||||
0x63, 0x6c, 0x75, 0x73, 0x74, 0x65, 0x72, 0x41, 0x64, 0x64, 0x72, 0x12, 0x19, 0x0a, 0x08, 0x61,
|
||||
0x70, 0x69, 0x5f, 0x61, 0x64, 0x64, 0x72, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x61,
|
||||
0x70, 0x69, 0x41, 0x64, 0x64, 0x72, 0x12, 0x12, 0x0a, 0x04, 0x6d, 0x6f, 0x64, 0x65, 0x18, 0x03,
|
||||
0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6d, 0x6f, 0x64, 0x65, 0x12, 0x17, 0x0a, 0x07, 0x6e, 0x6f,
|
||||
0x64, 0x65, 0x5f, 0x69, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x6e, 0x6f, 0x64,
|
||||
0x65, 0x49, 0x64, 0x12, 0x2b, 0x0a, 0x11, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x69,
|
||||
0x6f, 0x6e, 0x5f, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x10,
|
||||
0x65, 0x49, 0x6e, 0x66, 0x6f, 0x12, 0x1b, 0x0a, 0x09, 0x72, 0x61, 0x66, 0x74, 0x5f, 0x74, 0x65,
|
||||
0x72, 0x6d, 0x18, 0x07, 0x20, 0x01, 0x28, 0x04, 0x52, 0x08, 0x72, 0x61, 0x66, 0x74, 0x54, 0x65,
|
||||
0x72, 0x6d, 0x12, 0x32, 0x0a, 0x15, 0x72, 0x61, 0x66, 0x74, 0x5f, 0x64, 0x65, 0x73, 0x69, 0x72,
|
||||
0x65, 0x64, 0x5f, 0x73, 0x75, 0x66, 0x66, 0x72, 0x61, 0x67, 0x65, 0x18, 0x08, 0x20, 0x01, 0x28,
|
||||
0x09, 0x52, 0x13, 0x72, 0x61, 0x66, 0x74, 0x44, 0x65, 0x73, 0x69, 0x72, 0x65, 0x64, 0x53, 0x75,
|
||||
0x66, 0x66, 0x72, 0x61, 0x67, 0x65, 0x22, 0xfc, 0x01, 0x0a, 0x09, 0x45, 0x63, 0x68, 0x6f, 0x52,
|
||||
0x65, 0x70, 0x6c, 0x79, 0x12, 0x18, 0x0a, 0x07, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x18,
|
||||
0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x12, 0x23,
|
||||
0x0a, 0x0d, 0x63, 0x6c, 0x75, 0x73, 0x74, 0x65, 0x72, 0x5f, 0x61, 0x64, 0x64, 0x72, 0x73, 0x18,
|
||||
0x02, 0x20, 0x03, 0x28, 0x09, 0x52, 0x0c, 0x63, 0x6c, 0x75, 0x73, 0x74, 0x65, 0x72, 0x41, 0x64,
|
||||
0x64, 0x72, 0x73, 0x12, 0x2b, 0x0a, 0x11, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x69,
|
||||
0x6f, 0x6e, 0x5f, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x10,
|
||||
0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x53, 0x74, 0x61, 0x74, 0x65,
|
||||
0x22, 0x49, 0x0a, 0x09, 0x43, 0x6c, 0x69, 0x65, 0x6e, 0x74, 0x4b, 0x65, 0x79, 0x12, 0x12, 0x0a,
|
||||
0x04, 0x74, 0x79, 0x70, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x74, 0x79, 0x70,
|
||||
0x65, 0x12, 0x0c, 0x0a, 0x01, 0x78, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x01, 0x78, 0x12,
|
||||
0x0c, 0x0a, 0x01, 0x79, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x01, 0x79, 0x12, 0x0c, 0x0a,
|
||||
0x01, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x01, 0x64, 0x22, 0x1a, 0x0a, 0x18, 0x50,
|
||||
0x65, 0x72, 0x66, 0x53, 0x74, 0x61, 0x6e, 0x64, 0x62, 0x79, 0x45, 0x6c, 0x65, 0x63, 0x74, 0x69,
|
||||
0x6f, 0x6e, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x22, 0xe9, 0x01, 0x0a, 0x1b, 0x50, 0x65, 0x72, 0x66,
|
||||
0x53, 0x74, 0x61, 0x6e, 0x64, 0x62, 0x79, 0x45, 0x6c, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x52,
|
||||
0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x01, 0x20,
|
||||
0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x1d, 0x0a, 0x0a, 0x63, 0x6c, 0x75, 0x73, 0x74,
|
||||
0x65, 0x72, 0x5f, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x09, 0x63, 0x6c, 0x75,
|
||||
0x73, 0x74, 0x65, 0x72, 0x49, 0x64, 0x12, 0x30, 0x0a, 0x14, 0x70, 0x72, 0x69, 0x6d, 0x61, 0x72,
|
||||
0x79, 0x5f, 0x63, 0x6c, 0x75, 0x73, 0x74, 0x65, 0x72, 0x5f, 0x61, 0x64, 0x64, 0x72, 0x18, 0x03,
|
||||
0x20, 0x01, 0x28, 0x09, 0x52, 0x12, 0x70, 0x72, 0x69, 0x6d, 0x61, 0x72, 0x79, 0x43, 0x6c, 0x75,
|
||||
0x73, 0x74, 0x65, 0x72, 0x41, 0x64, 0x64, 0x72, 0x12, 0x17, 0x0a, 0x07, 0x63, 0x61, 0x5f, 0x63,
|
||||
0x65, 0x72, 0x74, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x06, 0x63, 0x61, 0x43, 0x65, 0x72,
|
||||
0x74, 0x12, 0x1f, 0x0a, 0x0b, 0x63, 0x6c, 0x69, 0x65, 0x6e, 0x74, 0x5f, 0x63, 0x65, 0x72, 0x74,
|
||||
0x18, 0x05, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x0a, 0x63, 0x6c, 0x69, 0x65, 0x6e, 0x74, 0x43, 0x65,
|
||||
0x72, 0x74, 0x12, 0x2f, 0x0a, 0x0a, 0x63, 0x6c, 0x69, 0x65, 0x6e, 0x74, 0x5f, 0x6b, 0x65, 0x79,
|
||||
0x18, 0x06, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x10, 0x2e, 0x76, 0x61, 0x75, 0x6c, 0x74, 0x2e, 0x43,
|
||||
0x6c, 0x69, 0x65, 0x6e, 0x74, 0x4b, 0x65, 0x79, 0x52, 0x09, 0x63, 0x6c, 0x69, 0x65, 0x6e, 0x74,
|
||||
0x4b, 0x65, 0x79, 0x32, 0xf0, 0x01, 0x0a, 0x11, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x46,
|
||||
0x6f, 0x72, 0x77, 0x61, 0x72, 0x64, 0x69, 0x6e, 0x67, 0x12, 0x3d, 0x0a, 0x0e, 0x46, 0x6f, 0x72,
|
||||
0x77, 0x61, 0x72, 0x64, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x13, 0x2e, 0x66, 0x6f,
|
||||
0x72, 0x77, 0x61, 0x72, 0x64, 0x69, 0x6e, 0x67, 0x2e, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74,
|
||||
0x1a, 0x14, 0x2e, 0x66, 0x6f, 0x72, 0x77, 0x61, 0x72, 0x64, 0x69, 0x6e, 0x67, 0x2e, 0x52, 0x65,
|
||||
0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x12, 0x2e, 0x0a, 0x04, 0x45, 0x63, 0x68, 0x6f,
|
||||
0x12, 0x12, 0x2e, 0x76, 0x61, 0x75, 0x6c, 0x74, 0x2e, 0x45, 0x63, 0x68, 0x6f, 0x52, 0x65, 0x71,
|
||||
0x75, 0x65, 0x73, 0x74, 0x1a, 0x10, 0x2e, 0x76, 0x61, 0x75, 0x6c, 0x74, 0x2e, 0x45, 0x63, 0x68,
|
||||
0x6f, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0x00, 0x12, 0x6c, 0x0a, 0x21, 0x50, 0x65, 0x72, 0x66,
|
||||
0x6f, 0x72, 0x6d, 0x61, 0x6e, 0x63, 0x65, 0x53, 0x74, 0x61, 0x6e, 0x64, 0x62, 0x79, 0x45, 0x6c,
|
||||
0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x1f, 0x2e,
|
||||
0x76, 0x61, 0x75, 0x6c, 0x74, 0x2e, 0x50, 0x65, 0x72, 0x66, 0x53, 0x74, 0x61, 0x6e, 0x64, 0x62,
|
||||
0x79, 0x45, 0x6c, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x1a, 0x22,
|
||||
0x12, 0x2c, 0x0a, 0x12, 0x72, 0x61, 0x66, 0x74, 0x5f, 0x61, 0x70, 0x70, 0x6c, 0x69, 0x65, 0x64,
|
||||
0x5f, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x10, 0x72, 0x61,
|
||||
0x66, 0x74, 0x41, 0x70, 0x70, 0x6c, 0x69, 0x65, 0x64, 0x49, 0x6e, 0x64, 0x65, 0x78, 0x12, 0x20,
|
||||
0x0a, 0x0c, 0x72, 0x61, 0x66, 0x74, 0x5f, 0x6e, 0x6f, 0x64, 0x65, 0x5f, 0x69, 0x64, 0x18, 0x05,
|
||||
0x20, 0x01, 0x28, 0x09, 0x52, 0x0a, 0x72, 0x61, 0x66, 0x74, 0x4e, 0x6f, 0x64, 0x65, 0x49, 0x64,
|
||||
0x12, 0x33, 0x0a, 0x09, 0x6e, 0x6f, 0x64, 0x65, 0x5f, 0x69, 0x6e, 0x66, 0x6f, 0x18, 0x06, 0x20,
|
||||
0x01, 0x28, 0x0b, 0x32, 0x16, 0x2e, 0x76, 0x61, 0x75, 0x6c, 0x74, 0x2e, 0x4e, 0x6f, 0x64, 0x65,
|
||||
0x49, 0x6e, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x08, 0x6e, 0x6f, 0x64,
|
||||
0x65, 0x49, 0x6e, 0x66, 0x6f, 0x22, 0xa9, 0x01, 0x0a, 0x0f, 0x4e, 0x6f, 0x64, 0x65, 0x49, 0x6e,
|
||||
0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x12, 0x21, 0x0a, 0x0c, 0x63, 0x6c, 0x75,
|
||||
0x73, 0x74, 0x65, 0x72, 0x5f, 0x61, 0x64, 0x64, 0x72, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52,
|
||||
0x0b, 0x63, 0x6c, 0x75, 0x73, 0x74, 0x65, 0x72, 0x41, 0x64, 0x64, 0x72, 0x12, 0x19, 0x0a, 0x08,
|
||||
0x61, 0x70, 0x69, 0x5f, 0x61, 0x64, 0x64, 0x72, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07,
|
||||
0x61, 0x70, 0x69, 0x41, 0x64, 0x64, 0x72, 0x12, 0x12, 0x0a, 0x04, 0x6d, 0x6f, 0x64, 0x65, 0x18,
|
||||
0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6d, 0x6f, 0x64, 0x65, 0x12, 0x17, 0x0a, 0x07, 0x6e,
|
||||
0x6f, 0x64, 0x65, 0x5f, 0x69, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x6e, 0x6f,
|
||||
0x64, 0x65, 0x49, 0x64, 0x12, 0x2b, 0x0a, 0x11, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74,
|
||||
0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0d, 0x52,
|
||||
0x10, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x53, 0x74, 0x61, 0x74,
|
||||
0x65, 0x22, 0x49, 0x0a, 0x09, 0x43, 0x6c, 0x69, 0x65, 0x6e, 0x74, 0x4b, 0x65, 0x79, 0x12, 0x12,
|
||||
0x0a, 0x04, 0x74, 0x79, 0x70, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x74, 0x79,
|
||||
0x70, 0x65, 0x12, 0x0c, 0x0a, 0x01, 0x78, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x01, 0x78,
|
||||
0x12, 0x0c, 0x0a, 0x01, 0x79, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x01, 0x79, 0x12, 0x0c,
|
||||
0x0a, 0x01, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x01, 0x64, 0x22, 0x1a, 0x0a, 0x18,
|
||||
0x50, 0x65, 0x72, 0x66, 0x53, 0x74, 0x61, 0x6e, 0x64, 0x62, 0x79, 0x45, 0x6c, 0x65, 0x63, 0x74,
|
||||
0x69, 0x6f, 0x6e, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x22, 0xe9, 0x01, 0x0a, 0x1b, 0x50, 0x65, 0x72,
|
||||
0x66, 0x53, 0x74, 0x61, 0x6e, 0x64, 0x62, 0x79, 0x45, 0x6c, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e,
|
||||
0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x01,
|
||||
0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, 0x1d, 0x0a, 0x0a, 0x63, 0x6c, 0x75, 0x73,
|
||||
0x74, 0x65, 0x72, 0x5f, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x09, 0x63, 0x6c,
|
||||
0x75, 0x73, 0x74, 0x65, 0x72, 0x49, 0x64, 0x12, 0x30, 0x0a, 0x14, 0x70, 0x72, 0x69, 0x6d, 0x61,
|
||||
0x72, 0x79, 0x5f, 0x63, 0x6c, 0x75, 0x73, 0x74, 0x65, 0x72, 0x5f, 0x61, 0x64, 0x64, 0x72, 0x18,
|
||||
0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x12, 0x70, 0x72, 0x69, 0x6d, 0x61, 0x72, 0x79, 0x43, 0x6c,
|
||||
0x75, 0x73, 0x74, 0x65, 0x72, 0x41, 0x64, 0x64, 0x72, 0x12, 0x17, 0x0a, 0x07, 0x63, 0x61, 0x5f,
|
||||
0x63, 0x65, 0x72, 0x74, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x06, 0x63, 0x61, 0x43, 0x65,
|
||||
0x72, 0x74, 0x12, 0x1f, 0x0a, 0x0b, 0x63, 0x6c, 0x69, 0x65, 0x6e, 0x74, 0x5f, 0x63, 0x65, 0x72,
|
||||
0x74, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x0a, 0x63, 0x6c, 0x69, 0x65, 0x6e, 0x74, 0x43,
|
||||
0x65, 0x72, 0x74, 0x12, 0x2f, 0x0a, 0x0a, 0x63, 0x6c, 0x69, 0x65, 0x6e, 0x74, 0x5f, 0x6b, 0x65,
|
||||
0x79, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x10, 0x2e, 0x76, 0x61, 0x75, 0x6c, 0x74, 0x2e,
|
||||
0x43, 0x6c, 0x69, 0x65, 0x6e, 0x74, 0x4b, 0x65, 0x79, 0x52, 0x09, 0x63, 0x6c, 0x69, 0x65, 0x6e,
|
||||
0x74, 0x4b, 0x65, 0x79, 0x32, 0xf0, 0x01, 0x0a, 0x11, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74,
|
||||
0x46, 0x6f, 0x72, 0x77, 0x61, 0x72, 0x64, 0x69, 0x6e, 0x67, 0x12, 0x3d, 0x0a, 0x0e, 0x46, 0x6f,
|
||||
0x72, 0x77, 0x61, 0x72, 0x64, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x13, 0x2e, 0x66,
|
||||
0x6f, 0x72, 0x77, 0x61, 0x72, 0x64, 0x69, 0x6e, 0x67, 0x2e, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73,
|
||||
0x74, 0x1a, 0x14, 0x2e, 0x66, 0x6f, 0x72, 0x77, 0x61, 0x72, 0x64, 0x69, 0x6e, 0x67, 0x2e, 0x52,
|
||||
0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x12, 0x2e, 0x0a, 0x04, 0x45, 0x63, 0x68,
|
||||
0x6f, 0x12, 0x12, 0x2e, 0x76, 0x61, 0x75, 0x6c, 0x74, 0x2e, 0x45, 0x63, 0x68, 0x6f, 0x52, 0x65,
|
||||
0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x10, 0x2e, 0x76, 0x61, 0x75, 0x6c, 0x74, 0x2e, 0x45, 0x63,
|
||||
0x68, 0x6f, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0x00, 0x12, 0x6c, 0x0a, 0x21, 0x50, 0x65, 0x72,
|
||||
0x66, 0x6f, 0x72, 0x6d, 0x61, 0x6e, 0x63, 0x65, 0x53, 0x74, 0x61, 0x6e, 0x64, 0x62, 0x79, 0x45,
|
||||
0x6c, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x1f,
|
||||
0x2e, 0x76, 0x61, 0x75, 0x6c, 0x74, 0x2e, 0x50, 0x65, 0x72, 0x66, 0x53, 0x74, 0x61, 0x6e, 0x64,
|
||||
0x62, 0x79, 0x45, 0x6c, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e,
|
||||
0x73, 0x65, 0x22, 0x00, 0x30, 0x01, 0x42, 0x22, 0x5a, 0x20, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62,
|
||||
0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x68, 0x61, 0x73, 0x68, 0x69, 0x63, 0x6f, 0x72, 0x70, 0x2f, 0x76,
|
||||
0x61, 0x75, 0x6c, 0x74, 0x2f, 0x76, 0x61, 0x75, 0x6c, 0x74, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74,
|
||||
0x6f, 0x33,
|
||||
0x62, 0x79, 0x45, 0x6c, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x1a,
|
||||
0x22, 0x2e, 0x76, 0x61, 0x75, 0x6c, 0x74, 0x2e, 0x50, 0x65, 0x72, 0x66, 0x53, 0x74, 0x61, 0x6e,
|
||||
0x64, 0x62, 0x79, 0x45, 0x6c, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x52, 0x65, 0x73, 0x70, 0x6f,
|
||||
0x6e, 0x73, 0x65, 0x22, 0x00, 0x30, 0x01, 0x42, 0x22, 0x5a, 0x20, 0x67, 0x69, 0x74, 0x68, 0x75,
|
||||
0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x68, 0x61, 0x73, 0x68, 0x69, 0x63, 0x6f, 0x72, 0x70, 0x2f,
|
||||
0x76, 0x61, 0x75, 0x6c, 0x74, 0x2f, 0x76, 0x61, 0x75, 0x6c, 0x74, 0x62, 0x06, 0x70, 0x72, 0x6f,
|
||||
0x74, 0x6f, 0x33,
|
||||
}
|
||||
|
||||
var (
|
||||
|
|
|
@ -18,6 +18,8 @@ message EchoRequest {
|
|||
uint64 raft_applied_index = 4;
|
||||
string raft_node_id = 5;
|
||||
NodeInformation node_info = 6;
|
||||
uint64 raft_term = 7;
|
||||
string raft_desired_suffrage = 8;
|
||||
}
|
||||
|
||||
message EchoReply {
|
||||
|
|
|
@ -1070,7 +1070,7 @@ type TestClusterOptions struct {
|
|||
// core in cluster will have 0, second 1, etc.
|
||||
// If the backend is shared across the cluster (i.e. is not Raft) then it
|
||||
// should return nil when coreIdx != 0.
|
||||
PhysicalFactory func(t testing.T, coreIdx int, logger log.Logger) *PhysicalBackendBundle
|
||||
PhysicalFactory func(t testing.T, coreIdx int, logger log.Logger, conf map[string]interface{}) *PhysicalBackendBundle
|
||||
// FirstCoreNumber is used to assign a unique number to each core within
|
||||
// a multi-cluster setup.
|
||||
FirstCoreNumber int
|
||||
|
@ -1091,6 +1091,8 @@ type TestClusterOptions struct {
|
|||
RaftAddressProvider raftlib.ServerAddressProvider
|
||||
|
||||
CoreMetricSinkProvider func(clusterName string) (*metricsutil.ClusterMetricSink, *metricsutil.MetricsHelper)
|
||||
|
||||
PhysicalFactoryConfig map[string]interface{}
|
||||
}
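A sketch of how a test might feed settings through the new PhysicalFactoryConfig field and the extended PhysicalFactory signature; the config key name and the factory body are illustrative assumptions, and the imports follow the ones this file already uses (go-testing-interface as testing, go-hclog as log):

func newClusterOpts() *vault.TestClusterOptions {
	return &vault.TestClusterOptions{
		PhysicalFactory: func(t testing.T, coreIdx int, logger log.Logger, conf map[string]interface{}) *vault.PhysicalBackendBundle {
			// conf carries whatever the test put in PhysicalFactoryConfig.
			if v, ok := conf["autopilot_reconcile_interval"]; ok {
				logger.Info("building raft backend", "core", coreIdx, "reconcile_interval", v)
			}
			return nil // a real test would return a raft-backed bundle here
		},
		PhysicalFactoryConfig: map[string]interface{}{
			"autopilot_reconcile_interval": "5s",
		},
	}
}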
|
||||
|
||||
var DefaultNumCores = 3
|
||||
|
@ -1456,6 +1458,7 @@ func NewTestCluster(t testing.T, base *CoreConfig, opts *TestClusterOptions) *Te
|
|||
coreConfig.SecureRandomReader = base.SecureRandomReader
|
||||
coreConfig.DisableSentinelTrace = base.DisableSentinelTrace
|
||||
coreConfig.ClusterName = base.ClusterName
|
||||
coreConfig.DisableAutopilot = base.DisableAutopilot
|
||||
|
||||
if base.BuiltinRegistry != nil {
|
||||
coreConfig.BuiltinRegistry = base.BuiltinRegistry
|
||||
|
@ -1763,7 +1766,7 @@ func (testCluster *TestCluster) newCore(t testing.T, idx int, coreConfig *CoreCo
|
|||
localConfig.Logger = testCluster.Logger.Named(fmt.Sprintf("core%d", idx))
|
||||
}
|
||||
if opts != nil && opts.PhysicalFactory != nil {
|
||||
physBundle := opts.PhysicalFactory(t, idx, localConfig.Logger)
|
||||
physBundle := opts.PhysicalFactory(t, idx, localConfig.Logger, opts.PhysicalFactoryConfig)
|
||||
switch {
|
||||
case physBundle == nil && coreConfig.Physical != nil:
|
||||
case physBundle == nil && coreConfig.Physical == nil:
|
||||
|
@ -1795,6 +1798,7 @@ func (testCluster *TestCluster) newCore(t testing.T, idx int, coreConfig *CoreCo
|
|||
|
||||
if opts != nil && opts.ClusterLayers != nil {
|
||||
localConfig.ClusterNetworkLayer = opts.ClusterLayers.Layers()[idx]
|
||||
localConfig.ClusterAddr = "https://" + localConfig.ClusterNetworkLayer.Listeners()[0].Addr().String()
|
||||
}
|
||||
|
||||
switch {
|
||||
|
|
|
@ -0,0 +1,373 @@
|
|||
Mozilla Public License Version 2.0
|
||||
==================================
|
||||
|
||||
1. Definitions
|
||||
--------------
|
||||
|
||||
1.1. "Contributor"
|
||||
means each individual or legal entity that creates, contributes to
|
||||
the creation of, or owns Covered Software.
|
||||
|
||||
1.2. "Contributor Version"
|
||||
means the combination of the Contributions of others (if any) used
|
||||
by a Contributor and that particular Contributor's Contribution.
|
||||
|
||||
1.3. "Contribution"
|
||||
means Covered Software of a particular Contributor.
|
||||
|
||||
1.4. "Covered Software"
|
||||
means Source Code Form to which the initial Contributor has attached
|
||||
the notice in Exhibit A, the Executable Form of such Source Code
|
||||
Form, and Modifications of such Source Code Form, in each case
|
||||
including portions thereof.
|
||||
|
||||
1.5. "Incompatible With Secondary Licenses"
|
||||
means
|
||||
|
||||
(a) that the initial Contributor has attached the notice described
|
||||
in Exhibit B to the Covered Software; or
|
||||
|
||||
(b) that the Covered Software was made available under the terms of
|
||||
version 1.1 or earlier of the License, but not also under the
|
||||
terms of a Secondary License.
|
||||
|
||||
1.6. "Executable Form"
|
||||
means any form of the work other than Source Code Form.
|
||||
|
||||
1.7. "Larger Work"
|
||||
means a work that combines Covered Software with other material, in
|
||||
a separate file or files, that is not Covered Software.
|
||||
|
||||
1.8. "License"
|
||||
means this document.
|
||||
|
||||
1.9. "Licensable"
|
||||
means having the right to grant, to the maximum extent possible,
|
||||
whether at the time of the initial grant or subsequently, any and
|
||||
all of the rights conveyed by this License.
|
||||
|
||||
1.10. "Modifications"
|
||||
means any of the following:
|
||||
|
||||
(a) any file in Source Code Form that results from an addition to,
|
||||
deletion from, or modification of the contents of Covered
|
||||
Software; or
|
||||
|
||||
(b) any new file in Source Code Form that contains any Covered
|
||||
Software.
|
||||
|
||||
1.11. "Patent Claims" of a Contributor
|
||||
means any patent claim(s), including without limitation, method,
|
||||
process, and apparatus claims, in any patent Licensable by such
|
||||
Contributor that would be infringed, but for the grant of the
|
||||
License, by the making, using, selling, offering for sale, having
|
||||
made, import, or transfer of either its Contributions or its
|
||||
Contributor Version.
|
||||
|
||||
1.12. "Secondary License"
|
||||
means either the GNU General Public License, Version 2.0, the GNU
|
||||
Lesser General Public License, Version 2.1, the GNU Affero General
|
||||
Public License, Version 3.0, or any later versions of those
|
||||
licenses.
|
||||
|
||||
1.13. "Source Code Form"
|
||||
means the form of the work preferred for making modifications.
|
||||
|
||||
1.14. "You" (or "Your")
|
||||
means an individual or a legal entity exercising rights under this
|
||||
License. For legal entities, "You" includes any entity that
|
||||
controls, is controlled by, or is under common control with You. For
|
||||
purposes of this definition, "control" means (a) the power, direct
|
||||
or indirect, to cause the direction or management of such entity,
|
||||
whether by contract or otherwise, or (b) ownership of more than
|
||||
fifty percent (50%) of the outstanding shares or beneficial
|
||||
ownership of such entity.
|
||||
|
||||
2. License Grants and Conditions
|
||||
--------------------------------
|
||||
|
||||
2.1. Grants
|
||||
|
||||
Each Contributor hereby grants You a world-wide, royalty-free,
|
||||
non-exclusive license:
|
||||
|
||||
(a) under intellectual property rights (other than patent or trademark)
|
||||
Licensable by such Contributor to use, reproduce, make available,
|
||||
modify, display, perform, distribute, and otherwise exploit its
|
||||
Contributions, either on an unmodified basis, with Modifications, or
|
||||
as part of a Larger Work; and
|
||||
|
||||
(b) under Patent Claims of such Contributor to make, use, sell, offer
|
||||
for sale, have made, import, and otherwise transfer either its
|
||||
Contributions or its Contributor Version.
|
||||
|
||||
2.2. Effective Date
|
||||
|
||||
The licenses granted in Section 2.1 with respect to any Contribution
|
||||
become effective for each Contribution on the date the Contributor first
|
||||
distributes such Contribution.
|
||||
|
||||
2.3. Limitations on Grant Scope
|
||||
|
||||
The licenses granted in this Section 2 are the only rights granted under
|
||||
this License. No additional rights or licenses will be implied from the
|
||||
distribution or licensing of Covered Software under this License.
|
||||
Notwithstanding Section 2.1(b) above, no patent license is granted by a
|
||||
Contributor:
|
||||
|
||||
(a) for any code that a Contributor has removed from Covered Software;
|
||||
or
|
||||
|
||||
(b) for infringements caused by: (i) Your and any other third party's
|
||||
modifications of Covered Software, or (ii) the combination of its
|
||||
Contributions with other software (except as part of its Contributor
|
||||
Version); or
|
||||
|
||||
(c) under Patent Claims infringed by Covered Software in the absence of
|
||||
its Contributions.
|
||||
|
||||
This License does not grant any rights in the trademarks, service marks,
|
||||
or logos of any Contributor (except as may be necessary to comply with
|
||||
the notice requirements in Section 3.4).
|
||||
|
||||
2.4. Subsequent Licenses
|
||||
|
||||
No Contributor makes additional grants as a result of Your choice to
|
||||
distribute the Covered Software under a subsequent version of this
|
||||
License (see Section 10.2) or under the terms of a Secondary License (if
|
||||
permitted under the terms of Section 3.3).
|
||||
|
||||
2.5. Representation
|
||||
|
||||
Each Contributor represents that the Contributor believes its
|
||||
Contributions are its original creation(s) or it has sufficient rights
|
||||
to grant the rights to its Contributions conveyed by this License.
|
||||
|
||||
2.6. Fair Use
|
||||
|
||||
This License is not intended to limit any rights You have under
|
||||
applicable copyright doctrines of fair use, fair dealing, or other
|
||||
equivalents.
|
||||
|
||||
2.7. Conditions
|
||||
|
||||
Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
|
||||
in Section 2.1.
|
||||
|
||||
3. Responsibilities
|
||||
-------------------
|
||||
|
||||
3.1. Distribution of Source Form
|
||||
|
||||
All distribution of Covered Software in Source Code Form, including any
|
||||
Modifications that You create or to which You contribute, must be under
|
||||
the terms of this License. You must inform recipients that the Source
|
||||
Code Form of the Covered Software is governed by the terms of this
|
||||
License, and how they can obtain a copy of this License. You may not
|
||||
attempt to alter or restrict the recipients' rights in the Source Code
|
||||
Form.
|
||||
|
||||
3.2. Distribution of Executable Form
|
||||
|
||||
If You distribute Covered Software in Executable Form then:
|
||||
|
||||
(a) such Covered Software must also be made available in Source Code
|
||||
Form, as described in Section 3.1, and You must inform recipients of
|
||||
the Executable Form how they can obtain a copy of such Source Code
|
||||
Form by reasonable means in a timely manner, at a charge no more
|
||||
than the cost of distribution to the recipient; and
|
||||
|
||||
(b) You may distribute such Executable Form under the terms of this
|
||||
License, or sublicense it under different terms, provided that the
|
||||
license for the Executable Form does not attempt to limit or alter
|
||||
the recipients' rights in the Source Code Form under this License.
|
||||
|
||||
3.3. Distribution of a Larger Work
|
||||
|
||||
You may create and distribute a Larger Work under terms of Your choice,
|
||||
provided that You also comply with the requirements of this License for
|
||||
the Covered Software. If the Larger Work is a combination of Covered
|
||||
Software with a work governed by one or more Secondary Licenses, and the
|
||||
Covered Software is not Incompatible With Secondary Licenses, this
|
||||
License permits You to additionally distribute such Covered Software
|
||||
under the terms of such Secondary License(s), so that the recipient of
|
||||
the Larger Work may, at their option, further distribute the Covered
|
||||
Software under the terms of either this License or such Secondary
|
||||
License(s).
|
||||
|
||||
3.4. Notices
|
||||
|
||||
You may not remove or alter the substance of any license notices
|
||||
(including copyright notices, patent notices, disclaimers of warranty,
|
||||
or limitations of liability) contained within the Source Code Form of
|
||||
the Covered Software, except that You may alter any license notices to
|
||||
the extent required to remedy known factual inaccuracies.
|
||||
|
||||
3.5. Application of Additional Terms
|
||||
|
||||
You may choose to offer, and to charge a fee for, warranty, support,
|
||||
indemnity or liability obligations to one or more recipients of Covered
|
||||
Software. However, You may do so only on Your own behalf, and not on
|
||||
behalf of any Contributor. You must make it absolutely clear that any
|
||||
such warranty, support, indemnity, or liability obligation is offered by
|
||||
You alone, and You hereby agree to indemnify every Contributor for any
|
||||
liability incurred by such Contributor as a result of warranty, support,
|
||||
indemnity or liability terms You offer. You may include additional
|
||||
disclaimers of warranty and limitations of liability specific to any
|
||||
jurisdiction.
|
||||
|
||||
4. Inability to Comply Due to Statute or Regulation
|
||||
---------------------------------------------------
|
||||
|
||||
If it is impossible for You to comply with any of the terms of this
|
||||
License with respect to some or all of the Covered Software due to
|
||||
statute, judicial order, or regulation then You must: (a) comply with
|
||||
the terms of this License to the maximum extent possible; and (b)
|
||||
describe the limitations and the code they affect. Such description must
|
||||
be placed in a text file included with all distributions of the Covered
|
||||
Software under this License. Except to the extent prohibited by statute
|
||||
or regulation, such description must be sufficiently detailed for a
|
||||
recipient of ordinary skill to be able to understand it.
|
||||
|
||||
5. Termination
|
||||
--------------
|
||||
|
||||
5.1. The rights granted under this License will terminate automatically
|
||||
if You fail to comply with any of its terms. However, if You become
|
||||
compliant, then the rights granted under this License from a particular
|
||||
Contributor are reinstated (a) provisionally, unless and until such
|
||||
Contributor explicitly and finally terminates Your grants, and (b) on an
|
||||
ongoing basis, if such Contributor fails to notify You of the
|
||||
non-compliance by some reasonable means prior to 60 days after You have
|
||||
come back into compliance. Moreover, Your grants from a particular
|
||||
Contributor are reinstated on an ongoing basis if such Contributor
|
||||
notifies You of the non-compliance by some reasonable means, this is the
|
||||
first time You have received notice of non-compliance with this License
|
||||
from such Contributor, and You become compliant prior to 30 days after
|
||||
Your receipt of the notice.
|
||||
|
||||
5.2. If You initiate litigation against any entity by asserting a patent
|
||||
infringement claim (excluding declaratory judgment actions,
|
||||
counter-claims, and cross-claims) alleging that a Contributor Version
|
||||
directly or indirectly infringes any patent, then the rights granted to
|
||||
You by any and all Contributors for the Covered Software under Section
|
||||
2.1 of this License shall terminate.
|
||||
|
||||
5.3. In the event of termination under Sections 5.1 or 5.2 above, all
|
||||
end user license agreements (excluding distributors and resellers) which
|
||||
have been validly granted by You or Your distributors under this License
|
||||
prior to termination shall survive termination.
|
||||
|
||||
************************************************************************
|
||||
* *
|
||||
* 6. Disclaimer of Warranty *
|
||||
* ------------------------- *
|
||||
* *
|
||||
* Covered Software is provided under this License on an "as is" *
|
||||
* basis, without warranty of any kind, either expressed, implied, or *
|
||||
* statutory, including, without limitation, warranties that the *
|
||||
* Covered Software is free of defects, merchantable, fit for a *
|
||||
* particular purpose or non-infringing. The entire risk as to the *
|
||||
* quality and performance of the Covered Software is with You. *
|
||||
* Should any Covered Software prove defective in any respect, You *
|
||||
* (not any Contributor) assume the cost of any necessary servicing, *
|
||||
* repair, or correction. This disclaimer of warranty constitutes an *
|
||||
* essential part of this License. No use of any Covered Software is *
|
||||
* authorized under this License except under this disclaimer. *
|
||||
* *
|
||||
************************************************************************
|
||||
|
||||
************************************************************************
|
||||
* *
|
||||
* 7. Limitation of Liability *
|
||||
* -------------------------- *
|
||||
* *
|
||||
* Under no circumstances and under no legal theory, whether tort *
|
||||
* (including negligence), contract, or otherwise, shall any *
|
||||
* Contributor, or anyone who distributes Covered Software as *
|
||||
* permitted above, be liable to You for any direct, indirect, *
|
||||
* special, incidental, or consequential damages of any character *
|
||||
* including, without limitation, damages for lost profits, loss of *
|
||||
* goodwill, work stoppage, computer failure or malfunction, or any *
|
||||
* and all other commercial damages or losses, even if such party *
|
||||
* shall have been informed of the possibility of such damages. This *
|
||||
* limitation of liability shall not apply to liability for death or *
|
||||
* personal injury resulting from such party's negligence to the *
|
||||
* extent applicable law prohibits such limitation. Some *
|
||||
* jurisdictions do not allow the exclusion or limitation of *
|
||||
* incidental or consequential damages, so this exclusion and *
|
||||
* limitation may not apply to You. *
|
||||
* *
|
||||
************************************************************************
|
||||
|
||||
8. Litigation
|
||||
-------------
|
||||
|
||||
Any litigation relating to this License may be brought only in the
|
||||
courts of a jurisdiction where the defendant maintains its principal
|
||||
place of business and such litigation shall be governed by laws of that
|
||||
jurisdiction, without reference to its conflict-of-law provisions.
|
||||
Nothing in this Section shall prevent a party's ability to bring
|
||||
cross-claims or counter-claims.
|
||||
|
||||
9. Miscellaneous
|
||||
----------------
|
||||
|
||||
This License represents the complete agreement concerning the subject
|
||||
matter hereof. If any provision of this License is held to be
|
||||
unenforceable, such provision shall be reformed only to the extent
|
||||
necessary to make it enforceable. Any law or regulation which provides
|
||||
that the language of a contract shall be construed against the drafter
|
||||
shall not be used to construe this License against a Contributor.
|
||||
|
||||
10. Versions of the License
|
||||
---------------------------
|
||||
|
||||
10.1. New Versions
|
||||
|
||||
Mozilla Foundation is the license steward. Except as provided in Section
|
||||
10.3, no one other than the license steward has the right to modify or
|
||||
publish new versions of this License. Each version will be given a
|
||||
distinguishing version number.
|
||||
|
||||
10.2. Effect of New Versions
|
||||
|
||||
You may distribute the Covered Software under the terms of the version
|
||||
of the License under which You originally received the Covered Software,
|
||||
or under the terms of any subsequent version published by the license
|
||||
steward.
|
||||
|
||||
10.3. Modified Versions
|
||||
|
||||
If you create software not governed by this License, and you want to
|
||||
create a new license for such software, you may create and use a
|
||||
modified version of this License if you rename the license and remove
|
||||
any references to the name of the license steward (except to note that
|
||||
such modified license differs from this License).
|
||||
|
||||
10.4. Distributing Source Code Form that is Incompatible With Secondary
|
||||
Licenses
|
||||
|
||||
If You choose to distribute Source Code Form that is Incompatible With
|
||||
Secondary Licenses under the terms of this version of the License, the
|
||||
notice described in Exhibit B of this License must be attached.
|
||||
|
||||
Exhibit A - Source Code Form License Notice
|
||||
-------------------------------------------
|
||||
|
||||
This Source Code Form is subject to the terms of the Mozilla Public
|
||||
License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
If it is not possible or desirable to put the notice in a particular
|
||||
file, then You may include the notice in a location (such as a LICENSE
|
||||
file in a relevant directory) where a recipient would be likely to look
|
||||
for such a notice.
|
||||
|
||||
You may add additional accurate notices of copyright ownership.
|
||||
|
||||
Exhibit B - "Incompatible With Secondary Licenses" Notice
|
||||
---------------------------------------------------------
|
||||
|
||||
This Source Code Form is "Incompatible With Secondary Licenses", as
|
||||
defined by the Mozilla Public License, v. 2.0.
|
|
@ -0,0 +1,2 @@
|
|||
# raft-autopilot
|
||||
Raft Autopilot
|
|
@ -0,0 +1,234 @@
|
|||
package autopilot
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
hclog "github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/raft"
|
||||
)
|
||||
|
||||
const (
|
||||
// These constants were taken from what exists in Consul at the time of module extraction.
|
||||
|
||||
DefaultUpdateInterval = 2 * time.Second
|
||||
DefaultReconcileInterval = 10 * time.Second
|
||||
)
|
||||
|
||||
// Option is an option to be used when creating a new Autopilot instance
|
||||
type Option func(*Autopilot)
|
||||
|
||||
// WithUpdateInterval returns an Option to set the Autopilot instance's
|
||||
// update interval.
|
||||
func WithUpdateInterval(t time.Duration) Option {
|
||||
if t == 0 {
|
||||
t = DefaultUpdateInterval
|
||||
}
|
||||
return func(a *Autopilot) {
|
||||
a.updateInterval = t
|
||||
}
|
||||
}
|
||||
|
||||
// WithReconcileInterval returns an Option to set the Autopilot instance's
|
||||
// reconcile interval.
|
||||
func WithReconcileInterval(t time.Duration) Option {
|
||||
if t == 0 {
|
||||
t = DefaultReconcileInterval
|
||||
}
|
||||
return func(a *Autopilot) {
|
||||
a.reconcileInterval = t
|
||||
}
|
||||
}
|
||||
|
||||
// WithLogger returns an Option to set the Autopilot instance's logger
|
||||
func WithLogger(logger hclog.Logger) Option {
|
||||
if logger == nil {
|
||||
logger = hclog.Default()
|
||||
}
|
||||
|
||||
return func(a *Autopilot) {
|
||||
a.logger = logger.Named("autopilot")
|
||||
}
|
||||
}
|
||||
|
||||
// withTimeProvider returns an Option which overrides an Autopilot instance's
|
||||
// time provider with the given one. This should only be used in tests
|
||||
// as a means of making some time.Time values in an autopilot state deterministic.
|
||||
// For real uses the default runtimeTimeProvider should be used.
|
||||
func withTimeProvider(provider timeProvider) Option {
|
||||
return func(a *Autopilot) {
|
||||
a.time = provider
|
||||
}
|
||||
}
|
||||
|
||||
// WithPromoter returns an Option to set the Promoter type that Autopilot will
|
||||
// use. When the option is not given the default StablePromoter from this package
|
||||
// will be used.
|
||||
func WithPromoter(promoter Promoter) Option {
|
||||
if promoter == nil {
|
||||
promoter = DefaultPromoter()
|
||||
}
|
||||
|
||||
return func(a *Autopilot) {
|
||||
a.promoter = promoter
|
||||
}
|
||||
}
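A sketch of combining these options when constructing an Autopilot instance. The New constructor, the hclog import, and the raftImpl/delegate arguments are assumptions based on the struct documented below; neither appears in this file.

// Assumes raftImpl satisfies this package's Raft interface and delegate
// implements ApplicationIntegration.
func newAutopilot(raftImpl Raft, delegate ApplicationIntegration, logger hclog.Logger) *Autopilot {
	return New(
		raftImpl,
		delegate,
		WithLogger(logger),
		WithReconcileInterval(15*time.Second),
		WithUpdateInterval(3*time.Second),
		WithPromoter(DefaultPromoter()),
	)
}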
|
||||
|
||||
// ExecutionStatus represents the current status of the autopilot background go routines
|
||||
type ExecutionStatus string
|
||||
|
||||
const (
|
||||
NotRunning ExecutionStatus = "not-running"
|
||||
Running ExecutionStatus = "running"
|
||||
ShuttingDown ExecutionStatus = "shutting-down"
|
||||
)
|
||||
|
||||
type execInfo struct {
|
||||
// status is the current state of autopilot execution
|
||||
status ExecutionStatus
|
||||
|
||||
// shutdown is a function that can be executed to shut down a running
|
||||
// autopilot's go routines.
|
||||
shutdown context.CancelFunc
|
||||
|
||||
// done is a chan that will be closed when the running autopilot go
|
||||
// routines have exited. Technically closing it is the very last
|
||||
// thing done in the go routine but at that point enough state has
|
||||
// been cleaned up that we would then allow it to be started
|
||||
// immediately afterward
|
||||
done chan struct{}
|
||||
}
|
||||
|
||||
// Autopilot is the type to manage a running Raft instance.
|
||||
//
|
||||
// Each Raft node in the cluster will have a corresponding Autopilot instance but
|
||||
// only 1 Autopilot instance should run at a time in the cluster. So when a node
|
||||
// gains Raft leadership the corresponding Autopilot instance should have its
|
||||
// Start method called. Then if leadership is lost that node should call the
|
||||
// Stop method on the Autopilot instance.
|
||||
type Autopilot struct {
|
||||
logger hclog.Logger
|
||||
// delegate is used to get information about the system such as Raft server
|
||||
// states, known servers etc.
|
||||
delegate ApplicationIntegration
|
||||
// promoter is used to calculate promotions, demotions and leadership transfers
|
||||
// given a particular autopilot State. The interface also contains methods
|
||||
// for filling in parts of the autopilot state that the core module doesn't
|
||||
// control such as the Ext fields on the Server and State types.
|
||||
promoter Promoter
|
||||
// raft is an interface that implements all the parts of the Raft library interface
|
||||
// that we use. It is an interface to allow for mocking raft during testing.
|
||||
raft Raft
|
||||
// time is an interface with a single method for getting the current time - `Now`.
|
||||
// In some tests this will be the MockTimeProvider which allows tests to be more
|
||||
// deterministic but for running systems this should not be overridden from the
|
||||
// default which is the runtimeTimeProvider and is a small shim around calling
|
||||
// time.Now.
|
||||
time timeProvider
|
||||
|
||||
// reconcileInterval is how long between rounds of performing promotions, demotions
|
||||
// and leadership transfers.
|
||||
reconcileInterval time.Duration
|
||||
|
||||
// updateInterval is the time between the periodic state updates. These periodic
|
||||
// state updates take in known servers from the delegate, request Raft stats be
|
||||
// fetched and pull in other inputs such as the Raft configuration to create
|
||||
// an updated view of the Autopilot State.
|
||||
updateInterval time.Duration
|
||||
|
||||
// state is the structure that autopilot uses to make decisions about what to do.
|
||||
// This field should be considered immutable and no modifications to an existing
|
||||
// state should be made but instead a new state is created and set to this field
|
||||
// while holding the stateLock.
|
||||
state *State
|
||||
// stateLock is meant to only protect the state field. This just prevents
|
||||
// the periodic state update and consumers requesting the autopilot state from
|
||||
// racing.
|
||||
stateLock sync.RWMutex
|
||||
|
||||
// startTime is recorded so that we can make better determinations about server
|
||||
// stability during the initial period of time after autopilot first starts.
|
||||
// If autopilot has just started the default behavior to check if a server is
|
||||
// stable will not work as it will ensure the server has been healthy for
|
||||
// the configured server stabilization time. If that configured time is longer
|
||||
// than the amount of time autopilot has been running you can run into issues
|
||||
// with leadership flapping during some scenarios where a cluster is being
|
||||
// brought up.
|
||||
startTime time.Time
|
||||
|
||||
// removeDeadCh is used to trigger the running autopilot go routines to
|
||||
// find and remove any dead/failed servers
|
||||
removeDeadCh chan struct{}
|
||||
|
||||
// reconcileCh is used to trigger an immediate round of reconciliation.
|
||||
reconcileCh chan struct{}
|
||||
|
||||
// leaderLock implements a cancellable mutex that will be used to ensure
|
||||
// that only one autopilot go routine is the "leader". The leader is
|
||||
// the go routine that is currently responsible for updating the
|
||||
// autopilot state and performing raft promotions/demotions.
|
||||
leaderLock *mutex
|
||||
|
||||
// execution is the information about the most recent autopilot execution.
|
||||
// Start will initialize this with the most recent execution and it will
|
||||
// be updated by Stop and by the go routines being executed when they are
|
||||
// finished.
|
||||
execution *execInfo
|
||||
|
||||
// execLock protects access to the execution field
|
||||
execLock sync.Mutex
|
||||
}
|
||||
|
||||
// New will create a new Autopilot instance utilizing the given Raft and Delegate.
|
||||
// If the WithPromoter option is not provided the default StablePromoter will
|
||||
// be used.
|
||||
func New(raft Raft, delegate ApplicationIntegration, options ...Option) *Autopilot {
|
||||
a := &Autopilot{
|
||||
raft: raft,
|
||||
delegate: delegate,
|
||||
state: &State{},
|
||||
promoter: DefaultPromoter(),
|
||||
logger: hclog.Default().Named("autopilot"),
|
||||
// buffered so that a trigger can be recorded even while a cleanup round is in progress
|
||||
removeDeadCh: make(chan struct{}, 1),
|
||||
reconcileInterval: DefaultReconcileInterval,
|
||||
updateInterval: DefaultUpdateInterval,
|
||||
time: &runtimeTimeProvider{},
|
||||
leaderLock: newMutex(),
|
||||
}
|
||||
|
||||
for _, opt := range options {
|
||||
opt(a)
|
||||
}
|
||||
|
||||
return a
|
||||
}
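For illustration only, a consumer of this package might construct an instance roughly as follows; raftNode, myDelegate and appLogger are assumed names for values satisfying the Raft and ApplicationIntegration interfaces and an hclog.Logger:

ap := autopilot.New(
	raftNode,
	myDelegate,
	autopilot.WithUpdateInterval(5*time.Second),
	autopilot.WithReconcileInterval(30*time.Second),
	autopilot.WithLogger(appLogger),
)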
|
||||
|
||||
// RemoveDeadServers will trigger an immediate removal of dead/failed servers.
|
||||
func (a *Autopilot) RemoveDeadServers() {
|
||||
select {
|
||||
case a.removeDeadCh <- struct{}{}:
|
||||
default:
|
||||
}
|
||||
}
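Because removeDeadCh is buffered with capacity one and the send above is non-blocking, repeated calls while a cleanup round is already pending simply coalesce. A hypothetical caller needs no extra coordination:

ap.RemoveDeadServers() // safe from any goroutine; duplicate triggers are dropped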
|
||||
|
||||
// GetState retrieves the current autopilot State
|
||||
func (a *Autopilot) GetState() *State {
|
||||
a.stateLock.Lock()
|
||||
defer a.stateLock.Unlock()
|
||||
return a.state
|
||||
}
|
||||
|
||||
// GetServerHealth returns the latest ServerHealth for a given server.
|
||||
// The returned struct should not be modified or else it will impact the autopilot's internal state.
|
||||
func (a *Autopilot) GetServerHealth(id raft.ServerID) *ServerHealth {
|
||||
state := a.GetState()
|
||||
|
||||
srv, ok := state.Servers[id]
|
||||
if ok {
|
||||
return &srv.Health
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
|
@ -0,0 +1,11 @@
|
|||
module github.com/hashicorp/raft-autopilot
|
||||
|
||||
go 1.14
|
||||
|
||||
require (
|
||||
github.com/hashicorp/go-hclog v0.14.1
|
||||
github.com/hashicorp/raft v1.2.0
|
||||
github.com/stretchr/testify v1.6.1
|
||||
go.uber.org/goleak v1.1.10
|
||||
golang.org/x/sync v0.0.0-20190423024810-112230192c58
|
||||
)
|
|
@ -0,0 +1,87 @@
|
|||
github.com/DataDog/datadog-go v2.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ=
|
||||
github.com/armon/go-metrics v0.0.0-20190430140413-ec5e00d3c878 h1:EFSB7Zo9Eg91v7MJPVsifUysc/wPdN+NOnVe6bWbdBM=
|
||||
github.com/armon/go-metrics v0.0.0-20190430140413-ec5e00d3c878/go.mod h1:3AMJUQhVx52RsWOnlkpikZr01T/yAVN2gn0861vByNg=
|
||||
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
|
||||
github.com/boltdb/bolt v1.3.1/go.mod h1:clJnj/oiGkjum5o1McbSZDSLxVThjynRyGBgiAx27Ps=
|
||||
github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible/go.mod h1:nmEj6Dob7S7YxXgwXpfOuvO54S+tGdZdw9fuRZt25Ag=
|
||||
github.com/circonus-labs/circonusllhist v0.1.3/go.mod h1:kMXHVDlOchFAehlya5ePtbp5jckzBHf4XRpQvBOLI+I=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/fatih/color v1.7.0 h1:DkWD4oS2D8LGGgTQ6IvwJJXSL5Vp2ffcQg58nFV38Ys=
|
||||
github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4=
|
||||
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
|
||||
github.com/hashicorp/go-cleanhttp v0.5.0/go.mod h1:JpRdi6/HCYpAwUzNwuwqhbovhLtngrth3wmdIIUrZ80=
|
||||
github.com/hashicorp/go-hclog v0.9.1 h1:9PZfAcVEvez4yhLH2TBU64/h/z4xlFI80cWXRrxuKuM=
|
||||
github.com/hashicorp/go-hclog v0.9.1/go.mod h1:5CU+agLiy3J7N7QjHK5d05KxGsuXiQLrjA0H7acj2lQ=
|
||||
github.com/hashicorp/go-hclog v0.14.1 h1:nQcJDQwIAGnmoUWp8ubocEX40cCml/17YkF6csQLReU=
|
||||
github.com/hashicorp/go-hclog v0.14.1/go.mod h1:whpDNt7SSdeAju8AWKIWsul05p54N/39EeqMAyrmvFQ=
|
||||
github.com/hashicorp/go-immutable-radix v1.0.0 h1:AKDB1HM5PWEA7i4nhcpwOrO2byshxBjXVn/J/3+z5/0=
|
||||
github.com/hashicorp/go-immutable-radix v1.0.0/go.mod h1:0y9vanUI8NX6FsYoO3zeMjhV/C5i9g4Q3DwcSNZ4P60=
|
||||
github.com/hashicorp/go-msgpack v0.5.5 h1:i9R9JSrqIz0QVLz3sz+i3YJdT7TTSLcfLLzJi9aZTuI=
|
||||
github.com/hashicorp/go-msgpack v0.5.5/go.mod h1:ahLV/dePpqEmjfWmKiqvPkv/twdG7iPBM1vqhUKIvfM=
|
||||
github.com/hashicorp/go-retryablehttp v0.5.3/go.mod h1:9B5zBasrRhHXnJnui7y6sL7es7NDiJgTc6Er0maI1Xs=
|
||||
github.com/hashicorp/go-uuid v1.0.0 h1:RS8zrF7PhGwyNPOtxSClXXj9HA8feRnJzgnI1RJCSnM=
|
||||
github.com/hashicorp/go-uuid v1.0.0/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro=
|
||||
github.com/hashicorp/golang-lru v0.5.0 h1:CL2msUPvZTLb5O648aiLNJw3hnBxN2+1Jq8rCOH9wdo=
|
||||
github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
|
||||
github.com/hashicorp/raft v1.2.0 h1:mHzHIrF0S91d3A7RPBvuqkgB4d/7oFJZyvf1Q4m7GA0=
|
||||
github.com/hashicorp/raft v1.2.0/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8=
|
||||
github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea/go.mod h1:pNv7Wc3ycL6F5oOWn+tPGo2gWD4a5X+yp/ntwdKLjRk=
|
||||
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
|
||||
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
|
||||
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
|
||||
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
|
||||
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
|
||||
github.com/mattn/go-colorable v0.1.4 h1:snbPLB8fVfU9iwbbo30TPtbLRzwWu6aJS6Xh4eaaviA=
|
||||
github.com/mattn/go-colorable v0.1.4/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE=
|
||||
github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s=
|
||||
github.com/mattn/go-isatty v0.0.10 h1:qxFzApOv4WsAL965uUPIsXzAKCZxN2p9UqdhFS4ZW10=
|
||||
github.com/mattn/go-isatty v0.0.10/go.mod h1:qgIWMr58cqv1PHHyhnkY9lrL7etaEgOFcMEpPG5Rm84=
|
||||
github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
|
||||
github.com/pascaldekloe/goe v0.1.0 h1:cBOtyMzM9HTpWjXfbbunk26uA6nG3a8n06Wieeh0MwY=
|
||||
github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc=
|
||||
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/prometheus/client_golang v0.9.2/go.mod h1:OsXs2jCmiKlQ1lTBmv21f2mNfw4xf/QclQDMrYNZzcM=
|
||||
github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
|
||||
github.com/prometheus/common v0.0.0-20181126121408-4724e9255275/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro=
|
||||
github.com/prometheus/procfs v0.0.0-20181204211112-1dc9a6cbc91a/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
|
||||
github.com/stretchr/objx v0.1.0 h1:4G4v2dO3VZwixGIRoQ5Lfboy6nUhCyYzaqnIAPPhYs4=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
|
||||
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
|
||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
|
||||
github.com/stretchr/testify v1.6.1 h1:hDPOHmpOpP40lSULcqw7IrRb/u7w6RpDC9399XyoNd0=
|
||||
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/tv42/httpunix v0.0.0-20150427012821-b75d8614f926/go.mod h1:9ESjWnEqriFuLhtthL60Sar/7RFoluCcXsuvEwTV5KM=
|
||||
go.uber.org/goleak v1.1.10 h1:z+mqJhf6ss6BSfSM671tgKyZBFPTTJM+HLxnhPC3wu0=
|
||||
go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/lint v0.0.0-20190930215403-16217165b5de h1:5hukYrvBGR8/eNkX5mdUezrA6JiaEZDtJb9Ei+1LlBs=
|
||||
golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
|
||||
golang.org/x/net v0.0.0-20181201002055-351d144fa1fc/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
|
||||
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f h1:Bl/8QSvNqXvPGPGXa2z5xUTmV7VDcZyvRZ+QQXkXTZQ=
|
||||
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.0.0-20190423024810-112230192c58 h1:8gQV6CLnAEikrhgkHFbMAEhagSSnXWGV915qUMm9mrU=
|
||||
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20190523142557-0e01d883c5c5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20191008105621-543471e840be h1:QAcqgptGM8IQBC9K/RC4o+O9YmqEm0diQn9QmZw/0mU=
|
||||
golang.org/x/sys v0.0.0-20191008105621-543471e840be/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
|
||||
golang.org/x/tools v0.0.0-20191108193012-7d206e10da11 h1:Yq9t9jnGoR+dBuitxdo9l6Q7xh/zOyNnYUtDKaQ3x0E=
|
||||
golang.org/x/tools v0.0.0-20191108193012-7d206e10da11/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
|
||||
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
|
||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo=
|
||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
|
@ -0,0 +1,35 @@
|
|||
/*
|
||||
This code was taken from the same implementation in a branch from Consul and then
|
||||
had the package updated and the mutex type unexported.
|
||||
*/
|
||||
package autopilot
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"golang.org/x/sync/semaphore"
|
||||
)
|
||||
|
||||
type mutex semaphore.Weighted
|
||||
|
||||
// newMutex returns a mutex that is ready for use.
|
||||
func newMutex() *mutex {
|
||||
return (*mutex)(semaphore.NewWeighted(1))
|
||||
}
|
||||
|
||||
func (m *mutex) Lock() {
|
||||
_ = (*semaphore.Weighted)(m).Acquire(context.Background(), 1)
|
||||
}
|
||||
|
||||
func (m *mutex) Unlock() {
|
||||
(*semaphore.Weighted)(m).Release(1)
|
||||
}
|
||||
|
||||
// TryLock acquires the mutex, blocking until resources are available or ctx is
|
||||
// done. On success, returns nil. On failure, returns ctx.Err() and leaves the
|
||||
// semaphore unchanged.
|
||||
//
|
||||
// If ctx is already done, TryLock may still succeed without blocking.
|
||||
func (m *mutex) TryLock(ctx context.Context) error {
|
||||
return (*semaphore.Weighted)(m).Acquire(ctx, 1)
|
||||
}
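A minimal sketch of the intended usage of this cancellable mutex (mirroring how the leader lock is used later in this change); ctx is an assumed context.Context:

m := newMutex()
if err := m.TryLock(ctx); err != nil {
	return // the context was cancelled before the lock was acquired
}
defer m.Unlock()
// ... critical section ...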
|
|
@ -0,0 +1,201 @@
|
|||
package autopilot
|
||||
|
||||
//
|
||||
// The methods in this file mainly provide synchronous wrappers
|
||||
// for Raft operations that would normally return futures.
|
||||
//
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
|
||||
"github.com/hashicorp/raft"
|
||||
)
|
||||
|
||||
func requiredQuorum(voters int) int {
|
||||
return (voters / 2) + 1
|
||||
}
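For reference, a few worked values of this quorum formula:

// voters=1 -> quorum 1 (tolerates 0 failures)
// voters=3 -> quorum 2 (tolerates 1 failure)
// voters=5 -> quorum 3 (tolerates 2 failures)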
|
||||
|
||||
// NumVoters is a helper for calculating the number of voting peers in the
|
||||
// current raft configuration. This function ignores any autopilot state
|
||||
// and will make the calculation based on a newly retrieved Raft configuration.
|
||||
func (a *Autopilot) NumVoters() (int, error) {
|
||||
cfg, err := a.getRaftConfiguration()
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
var numVoters int
|
||||
for _, server := range cfg.Servers {
|
||||
if server.Suffrage == raft.Voter {
|
||||
numVoters++
|
||||
}
|
||||
}
|
||||
|
||||
return numVoters, nil
|
||||
}
|
||||
|
||||
// AddServer is a helper for adding a new server to the raft configuration.
|
||||
// This may remove servers with duplicate addresses or IDs first and after
// it is all done will trigger autopilot to remove dead servers if there
// are any. Servers added by this method will start in a non-voting
// state and later on autopilot will promote them to voting status
// if desired by the configured promoter. If performing the required
// removals would cause a loss of leadership then an error is returned
|
||||
// instead of performing any Raft configuration changes.
|
||||
func (a *Autopilot) AddServer(s *Server) error {
|
||||
cfg, err := a.getRaftConfiguration()
|
||||
if err != nil {
|
||||
a.logger.Error("failed to get raft configuration", "error", err)
|
||||
return err
|
||||
}
|
||||
|
||||
var existingVoter bool
|
||||
var voterRemovals []raft.ServerID
|
||||
var nonVoterRemovals []raft.ServerID
|
||||
var numVoters int
|
||||
for _, server := range cfg.Servers {
|
||||
if server.Suffrage == raft.Voter {
|
||||
numVoters++
|
||||
}
|
||||
|
||||
if server.Address == s.Address && server.ID == s.ID {
|
||||
// nothing to be done as the addr and ID both already match
|
||||
return nil
|
||||
} else if server.ID == s.ID {
|
||||
// special case for address updates only. In this case we should be
|
||||
// able to update the configuration without having to first remove the server
|
||||
if server.Suffrage == raft.Voter || server.Suffrage == raft.Staging {
|
||||
existingVoter = true
|
||||
}
|
||||
} else if server.Address == s.Address {
|
||||
if server.Suffrage == raft.Voter {
|
||||
voterRemovals = append(voterRemovals, server.ID)
|
||||
} else {
|
||||
nonVoterRemovals = append(nonVoterRemovals, server.ID)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
requiredVoters := requiredQuorum(numVoters)
|
||||
if len(voterRemovals) > numVoters-requiredVoters {
|
||||
return fmt.Errorf("Preventing server addition that would require removal of too many servers and cause cluster instability")
|
||||
}
|
||||
|
||||
for _, id := range voterRemovals {
|
||||
if err := a.removeServer(id); err != nil {
|
||||
return fmt.Errorf("error removing server %q with duplicate address %q: %w", id, s.Address, err)
|
||||
}
|
||||
a.logger.Info("removed server with duplicate address", "address", s.Address)
|
||||
}
|
||||
|
||||
for _, id := range nonVoterRemovals {
|
||||
if err := a.removeServer(id); err != nil {
|
||||
return fmt.Errorf("error removing server %q with duplicate address %q: %w", id, s.Address, err)
|
||||
}
|
||||
a.logger.Info("removed server with duplicate address", "address", s.Address)
|
||||
}
|
||||
|
||||
if existingVoter {
|
||||
if err := a.addVoter(s.ID, s.Address); err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
if err := a.addNonVoter(s.ID, s.Address); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
// Trigger a check to remove dead servers
|
||||
a.RemoveDeadServers()
|
||||
return nil
|
||||
}
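A hedged usage sketch; the server ID and address below are made up and ap is an assumed *Autopilot:

err := ap.AddServer(&autopilot.Server{
	ID:         "server-4",      // hypothetical ID
	Address:    "10.0.0.4:8300", // hypothetical address
	NodeStatus: autopilot.NodeAlive,
})
if err != nil {
	// too many conflicting servers would have needed removal, or Raft rejected the change
}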
|
||||
|
||||
// RemoveServer is a helper to remove a server from Raft if it
|
||||
// exists in the latest Raft configuration
|
||||
func (a *Autopilot) RemoveServer(id raft.ServerID) error {
|
||||
cfg, err := a.getRaftConfiguration()
|
||||
if err != nil {
|
||||
a.logger.Error("failed to get raft configuration", "error", err)
|
||||
return err
|
||||
}
|
||||
|
||||
// only remove servers currently in the configuration
|
||||
for _, server := range cfg.Servers {
|
||||
if server.ID == id {
|
||||
return a.removeServer(server.ID)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// addNonVoter is a wrapper around calling the AddNonvoter method on the Raft
|
||||
// interface object provided to Autopilot
|
||||
func (a *Autopilot) addNonVoter(id raft.ServerID, addr raft.ServerAddress) error {
|
||||
addFuture := a.raft.AddNonvoter(id, addr, 0, 0)
|
||||
if err := addFuture.Error(); err != nil {
|
||||
a.logger.Error("failed to add raft non-voting peer", "id", id, "address", addr, "error", err)
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// addVoter is a wrapper around calling the AddVoter method on the Raft
|
||||
// interface object provided to Autopilot
|
||||
func (a *Autopilot) addVoter(id raft.ServerID, addr raft.ServerAddress) error {
|
||||
addFuture := a.raft.AddVoter(id, addr, 0, 0)
|
||||
if err := addFuture.Error(); err != nil {
|
||||
a.logger.Error("failed to add raft voting peer", "id", id, "address", addr, "error", err)
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (a *Autopilot) demoteVoter(id raft.ServerID) error {
|
||||
removeFuture := a.raft.DemoteVoter(id, 0, 0)
|
||||
if err := removeFuture.Error(); err != nil {
|
||||
a.logger.Error("failed to demote raft peer", "id", id, "error", err)
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// removeServer is a wrapper around calling the RemoveServer method on the
|
||||
// Raft interface object provided to Autopilot
|
||||
func (a *Autopilot) removeServer(id raft.ServerID) error {
|
||||
a.logger.Debug("removing server by ID", "id", id)
|
||||
future := a.raft.RemoveServer(id, 0, 0)
|
||||
if err := future.Error(); err != nil {
|
||||
a.logger.Error("failed to remove raft server",
|
||||
"id", id,
|
||||
"error", err,
|
||||
)
|
||||
return err
|
||||
}
|
||||
a.logger.Info("removed server", "id", id)
|
||||
return nil
|
||||
}
|
||||
|
||||
// getRaftConfiguration is a wrapper around calling the GetConfiguration method
|
||||
// on the Raft interface object provided to Autopilot
|
||||
func (a *Autopilot) getRaftConfiguration() (*raft.Configuration, error) {
|
||||
configFuture := a.raft.GetConfiguration()
|
||||
if err := configFuture.Error(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
cfg := configFuture.Configuration()
|
||||
return &cfg, nil
|
||||
}
|
||||
|
||||
// lastTerm will retrieve the raft stats and then pull the last term value out of it
|
||||
func (a *Autopilot) lastTerm() (uint64, error) {
|
||||
return strconv.ParseUint(a.raft.Stats()["last_log_term"], 10, 64)
|
||||
}
|
||||
|
||||
// leadershipTransfer will transfer leadership to the server with the specified id and address
|
||||
func (a *Autopilot) leadershipTransfer(id raft.ServerID, address raft.ServerAddress) error {
|
||||
a.logger.Info("Transferring leadership to new server", "id", id, "address", address)
|
||||
future := a.raft.LeadershipTransferToServer(id, address)
|
||||
return future.Error()
|
||||
}
|
|
@ -0,0 +1,281 @@
|
|||
package autopilot
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
|
||||
"github.com/hashicorp/raft"
|
||||
)
|
||||
|
||||
// reconcile calculates and then applies promotions and demotions
|
||||
func (a *Autopilot) reconcile() error {
|
||||
conf := a.delegate.AutopilotConfig()
|
||||
if conf == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
// grab the current state while locked
|
||||
a.stateLock.Lock()
|
||||
state := a.state
|
||||
a.stateLock.Unlock()
|
||||
|
||||
if state == nil || state.Leader == "" {
|
||||
return fmt.Errorf("Cannote reconcile Raft server voting rights without a valid autopilot state")
|
||||
}
|
||||
|
||||
// have the promoter calculate the required Raft changeset.
|
||||
changes := a.promoter.CalculatePromotionsAndDemotions(conf, state)
|
||||
|
||||
// apply the promotions, if we did apply any then stop here
|
||||
// as we do not want to apply the demotions at the same time
|
||||
// as a means of preventing cluster instability.
|
||||
if done, err := a.applyPromotions(state, changes); done {
|
||||
return err
|
||||
}
|
||||
|
||||
// apply the demotions, if we did apply any then stop here
|
||||
// as we do not want to transition leadership and do demotions
|
||||
// at the same time. This is a preventative measure to maintain
|
||||
// cluster stability.
|
||||
if done, err := a.applyDemotions(state, changes); done {
|
||||
return err
|
||||
}
|
||||
|
||||
// if no leadership transfer is desired then we can exit the method now.
|
||||
if changes.Leader == "" || changes.Leader == state.Leader {
|
||||
return nil
|
||||
}
|
||||
|
||||
// lookup the server we want to transfer leadership to
|
||||
srv, ok := state.Servers[changes.Leader]
|
||||
if !ok {
|
||||
return fmt.Errorf("cannot transfer leadership to an unknown server with ID %s", changes.Leader)
|
||||
}
|
||||
|
||||
// perform the leadership transfer
|
||||
return a.leadershipTransfer(changes.Leader, srv.Server.Address)
|
||||
}
|
||||
|
||||
// applyPromotions will apply all the promotions in the RaftChanges parameter.
|
||||
//
|
||||
// IDs in the change set will be ignored if:
|
||||
// * The server isn't tracked in the provided state
|
||||
// * The server already has voting rights
|
||||
// * The server is not healthy
|
||||
//
|
||||
// If any servers were promoted this function returns true for the bool value.
|
||||
func (a *Autopilot) applyPromotions(state *State, changes RaftChanges) (bool, error) {
|
||||
promoted := false
|
||||
for _, change := range changes.Promotions {
|
||||
srv, found := state.Servers[change]
|
||||
if !found {
|
||||
a.logger.Debug("Ignoring promotion of server as it is not in the autopilot state", "id", change)
|
||||
// this shouldn't be able to happen but is a nice safety measure against the
|
||||
// delegate doing something less than desirable
|
||||
continue
|
||||
}
|
||||
|
||||
if srv.HasVotingRights() {
|
||||
// There is no need to promote as this server is already a voter.
|
||||
// No logging is needed here as this could be a very common case
|
||||
// where the promoter just returns a list of server IDs that should
|
||||
// be voters and non-voters without caring about which ones currently
|
||||
// already are in that state.
|
||||
a.logger.Debug("Not promoting server that already has voting rights", "id", change)
|
||||
continue
|
||||
}
|
||||
|
||||
if !srv.Health.Healthy {
|
||||
// do not promote unhealthy servers
|
||||
a.logger.Debug("Ignoring promotion of unhealthy server", "id", change)
|
||||
continue
|
||||
}
|
||||
|
||||
a.logger.Info("Promoting server", "id", srv.Server.ID, "address", srv.Server.Address, "name", srv.Server.Name)
|
||||
|
||||
if err := a.addVoter(srv.Server.ID, srv.Server.Address); err != nil {
|
||||
return true, fmt.Errorf("failed promoting server %s: %v", srv.Server.ID, err)
|
||||
}
|
||||
|
||||
promoted = true
|
||||
}
|
||||
|
||||
// when we promoted anything we return true to indicate that the promotion/demotion applying
|
||||
// process is finished to prevent promotions and demotions in the same round. This is what
|
||||
// autopilot within Consul used to do so I am keeping the behavior the same for now.
|
||||
return promoted, nil
|
||||
}
|
||||
|
||||
// applyDemotions will apply all the demotions in the RaftChanges parameter.
|
||||
//
|
||||
// IDs in the change set will be ignored if:
|
||||
// * The server isn't tracked in the provided state
|
||||
// * The server does not have voting rights
|
||||
//
|
||||
// If any servers were demoted this function returns true for the bool value.
|
||||
func (a *Autopilot) applyDemotions(state *State, changes RaftChanges) (bool, error) {
|
||||
demoted := false
|
||||
for _, change := range changes.Demotions {
|
||||
srv, found := state.Servers[change]
|
||||
if !found {
|
||||
a.logger.Debug("Ignoring demotion of server as it is not in the autopilot state", "id", change)
|
||||
// this shouldn't be able to happen but is a nice safety measure against the
|
||||
// delegate doing something less than desirable
|
||||
continue
|
||||
}
|
||||
|
||||
if srv.State == RaftNonVoter {
|
||||
// There is no need to demote as this server is already a non-voter.
|
||||
// No logging is needed here as this could be a very common case
|
||||
// where the promoter just returns a list of server IDs that should
|
||||
// be voters and non-voters without caring about which ones currently
|
||||
// already are in that state.
|
||||
a.logger.Debug("Ignoring demotion of server that is already a non-voter", "id", change)
|
||||
continue
|
||||
}
|
||||
|
||||
a.logger.Info("Demoting server", "id", srv.Server.ID, "address", srv.Server.Address, "name", srv.Server.Name)
|
||||
|
||||
if err := a.demoteVoter(srv.Server.ID); err != nil {
|
||||
return true, fmt.Errorf("failed demoting server %s: %v", srv.Server.ID, err)
|
||||
}
|
||||
|
||||
demoted = true
|
||||
}
|
||||
|
||||
// similarly to applyPromotions here we want to stop the process and prevent leadership
|
||||
// transfer when any demotions took place. Basically we want to ensure the cluster is
|
||||
// stable before doing the transfer
|
||||
return demoted, nil
|
||||
}
|
||||
|
||||
// getFailedServers aggregates all of the information about servers that the consuming application believes are in
|
||||
// a failed/left state (indicated by the NodeStatus field on the Server type) as well as stale servers that are
|
||||
// in the raft configuration but not known to the consuming application. This function will do nothing with
|
||||
// that information and is purely to collect the data.
|
||||
func (a *Autopilot) getFailedServers() (*FailedServers, int, error) {
|
||||
staleRaftServers := make(map[raft.ServerID]raft.Server)
|
||||
raftConfig, err := a.getRaftConfiguration()
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
// Populate a map of all the raft servers. We will
|
||||
// remove some later on from the map leaving us with
|
||||
// just the stale servers.
|
||||
var voters int
|
||||
for _, server := range raftConfig.Servers {
|
||||
staleRaftServers[server.ID] = server
|
||||
|
||||
if server.Suffrage == raft.Voter {
|
||||
voters++
|
||||
}
|
||||
}
|
||||
|
||||
var failed FailedServers
|
||||
for id, srv := range a.delegate.KnownServers() {
|
||||
raftSrv, found := staleRaftServers[id]
|
||||
if found {
|
||||
delete(staleRaftServers, id)
|
||||
}
|
||||
|
||||
if srv.NodeStatus != NodeAlive {
|
||||
if found && raftSrv.Suffrage == raft.Voter {
|
||||
failed.FailedVoters = append(failed.FailedVoters, srv)
|
||||
} else if found {
|
||||
failed.FailedNonVoters = append(failed.FailedNonVoters, srv)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for id, srv := range staleRaftServers {
|
||||
if srv.Suffrage == raft.Voter {
|
||||
failed.StaleVoters = append(failed.StaleVoters, id)
|
||||
} else {
|
||||
failed.StaleNonVoters = append(failed.StaleNonVoters, id)
|
||||
}
|
||||
}
|
||||
|
||||
sort.Slice(failed.StaleNonVoters, func(i, j int) bool {
|
||||
return failed.StaleNonVoters[i] < failed.StaleNonVoters[j]
|
||||
})
|
||||
sort.Slice(failed.StaleVoters, func(i, j int) bool {
|
||||
return failed.StaleVoters[i] < failed.StaleVoters[j]
|
||||
})
|
||||
sort.Slice(failed.FailedNonVoters, func(i, j int) bool {
|
||||
return failed.FailedNonVoters[i].ID < failed.FailedNonVoters[j].ID
|
||||
})
|
||||
sort.Slice(failed.FailedVoters, func(i, j int) bool {
|
||||
return failed.FailedVoters[i].ID < failed.FailedVoters[j].ID
|
||||
})
|
||||
|
||||
return &failed, voters, nil
|
||||
}
|
||||
|
||||
// pruneDeadServers will find stale raft servers and failed servers as indicated by the consuming application
|
||||
// and remove them. For stale raft servers this means removing them from the Raft configuration. For failed
|
||||
// servers this means issuing RemoveFailedServer calls to the delegate. All stale/failed non-voters will be
|
||||
// removed first. Then stale voters and finally failed servers. For servers with voting rights we will
|
||||
// cap the number removed so that we do not remove too many at a time and do not remove nodes to the
|
||||
// point where the number of voters would be below the MinQuorum value from the autopilot config.
|
||||
// Additionally the delegate will be consulted to determine if all of the removals should be done and
|
||||
// can filter the failed servers listings if need be.
|
||||
func (a *Autopilot) pruneDeadServers() error {
|
||||
conf := a.delegate.AutopilotConfig()
|
||||
if conf == nil || !conf.CleanupDeadServers {
|
||||
return nil
|
||||
}
|
||||
|
||||
state := a.GetState()
|
||||
|
||||
failed, voters, err := a.getFailedServers()
|
||||
if err != nil || failed == nil {
|
||||
return err
|
||||
}
|
||||
|
||||
failed = a.promoter.FilterFailedServerRemovals(conf, state, failed)
|
||||
|
||||
// remove failed non voting servers
|
||||
for _, srv := range failed.FailedNonVoters {
|
||||
a.logger.Info("Attempting removal of failed server node", "id", srv.ID, "name", srv.Name, "address", srv.Address)
|
||||
a.delegate.RemoveFailedServer(srv)
|
||||
}
|
||||
|
||||
// remove stale non voters
|
||||
for _, id := range failed.StaleNonVoters {
|
||||
a.logger.Debug("removing stale raft server from configuration", "id", id)
|
||||
if err := a.removeServer(id); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
maxRemoval := (voters - 1) / 2
|
||||
|
||||
for _, id := range failed.StaleVoters {
|
||||
if voters-1 < int(conf.MinQuorum) {
|
||||
a.logger.Debug("will not remove server as it would leave less voters than the minimum number allowed", "id", id, "min", conf.MinQuorum)
|
||||
} else if maxRemoval < 1 {
|
||||
a.logger.Debug("will not remove server as removal of a majority or servers is not safe", "id", id)
|
||||
} else if err := a.removeServer(id); err != nil {
|
||||
return err
|
||||
} else {
|
||||
maxRemoval--
|
||||
voters--
|
||||
}
|
||||
}
|
||||
|
||||
for _, srv := range failed.FailedVoters {
|
||||
if voters-1 < int(conf.MinQuorum) {
|
||||
a.logger.Debug("will not remove server as it would leave less voters than the minimum number allowed", "id", srv.ID, "min", conf.MinQuorum)
|
||||
} else if maxRemoval < 1 {
|
||||
a.logger.Debug("will not remove server as its removal would be unsafe due to affectingas removal of a majority or servers is not safe", "id", srv.ID)
|
||||
} else {
|
||||
a.logger.Info("Attempting removal of failed server node", "id", srv.ID, "name", srv.Name, "address", srv.Address)
|
||||
a.delegate.RemoveFailedServer(srv)
|
||||
maxRemoval--
|
||||
voters--
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
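To make the two safety guards above concrete (numbers are illustrative only):

// with voters=5: maxRemoval = (5-1)/2 = 2, so at most two voting servers can
// be removed in a single pass; and with conf.MinQuorum=3 any removal that
// would leave fewer than 3 voters is skipped regardless of maxRemoval.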
|
|
@ -0,0 +1,178 @@
|
|||
package autopilot
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Start will launch the go routines in the background to perform Autopilot.
|
||||
// When the context passed in is cancelled or the Stop method is called
|
||||
// then these routines will exit.
|
||||
func (a *Autopilot) Start(ctx context.Context) {
|
||||
a.execLock.Lock()
|
||||
defer a.execLock.Unlock()
|
||||
|
||||
// already running so there is nothing to do
|
||||
if a.execution != nil && a.execution.status == Running {
|
||||
return
|
||||
}
|
||||
|
||||
ctx, shutdown := context.WithCancel(ctx)
|
||||
a.startTime = a.time.Now()
|
||||
|
||||
exec := &execInfo{
|
||||
status: Running,
|
||||
shutdown: shutdown,
|
||||
done: make(chan struct{}),
|
||||
}
|
||||
|
||||
if a.execution == nil || a.execution.status == NotRunning {
|
||||
// In theory with a nil execution or the current execution being in the not
|
||||
// running state, we should be able to immediately gain the leader lock as
|
||||
// nothing else should be running and holding the lock. While that is true we still
|
||||
// gain the lock to ensure that only one thread may even attempt to be
|
||||
// modifying the autopilot state at once.
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
|
||||
defer cancel()
|
||||
if err := a.leaderLock.TryLock(ctx); err == nil {
|
||||
a.updateState(ctx)
|
||||
a.leaderLock.Unlock()
|
||||
}
|
||||
}
|
||||
|
||||
go a.beginExecution(ctx, exec)
|
||||
a.execution = exec
|
||||
return
|
||||
}
|
||||
|
||||
// Stop will terminate the go routines being executed to perform autopilot.
|
||||
func (a *Autopilot) Stop() <-chan struct{} {
|
||||
a.execLock.Lock()
|
||||
defer a.execLock.Unlock()
|
||||
|
||||
// Nothing to do
|
||||
if a.execution == nil || a.execution.status == NotRunning {
|
||||
done := make(chan struct{})
|
||||
close(done)
|
||||
return done
|
||||
}
|
||||
|
||||
a.execution.shutdown()
|
||||
a.execution.status = ShuttingDown
|
||||
return a.execution.done
|
||||
}
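A hedged sketch of how a consumer is expected to drive Start and Stop from its own leadership notifications; notifyCh and ctx are assumptions (hashicorp/raft's LeaderCh is one possible source of such a channel):

for isLeader := range notifyCh {
	if isLeader {
		ap.Start(ctx)
	} else {
		<-ap.Stop() // block until the autopilot go routines have fully exited
	}
}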
|
||||
|
||||
// IsRunning returns the current execution status of the autopilot
|
||||
// go routines as well as a chan which will be closed when the
|
||||
// routines are no longer running
|
||||
func (a *Autopilot) IsRunning() (ExecutionStatus, <-chan struct{}) {
|
||||
a.execLock.Lock()
|
||||
defer a.execLock.Unlock()
|
||||
|
||||
if a.execution == nil || a.execution.status == NotRunning {
|
||||
done := make(chan struct{})
|
||||
close(done)
|
||||
return NotRunning, done
|
||||
}
|
||||
|
||||
return a.execution.status, a.execution.done
|
||||
}
|
||||
|
||||
func (a *Autopilot) finishExecution(exec *execInfo) {
|
||||
// need to gain the lock because if this was the active execution
|
||||
// then these values may be read while they are updated.
|
||||
a.execLock.Lock()
|
||||
defer a.execLock.Unlock()
|
||||
|
||||
exec.shutdown = nil
|
||||
exec.status = NotRunning
|
||||
// this should be the final cleanup task as it is what notifies the rest
|
||||
// of the world that we are now done
|
||||
close(exec.done)
|
||||
exec.done = nil
|
||||
}
|
||||
|
||||
func (a *Autopilot) beginExecution(ctx context.Context, exec *execInfo) {
|
||||
// This will wait for any other go routine to finish executing
|
||||
// before running any code ourselves to prevent any conflicting
|
||||
// activity between the two.
|
||||
if err := a.leaderLock.TryLock(ctx); err != nil {
|
||||
a.finishExecution(exec)
|
||||
return
|
||||
}
|
||||
|
||||
a.logger.Debug("autopilot is now running")
|
||||
|
||||
// autopilot needs to do 3 things
|
||||
//
|
||||
// 1. periodically update the cluster state
|
||||
// 2. periodically check for and perform promotions and demotions
|
||||
// 3. Respond to servers leaving and prune dead servers
|
||||
//
|
||||
// We could attempt to do all of this in a single go routine except that
|
||||
// updating the cluster health could potentially take long enough to impact
|
||||
// the periodicity of the promotions and demotions performed by task 2/3.
|
||||
// So instead this go routine will spawn a second go routine to manage
|
||||
// updating the cluster health in the background. This go routine is still
|
||||
// in control of the overall running status and will not exit until the
|
||||
// child go routine has exited.
|
||||
|
||||
// child go routine for cluster health updating
|
||||
stateUpdaterDone := make(chan struct{})
|
||||
go a.runStateUpdater(ctx, stateUpdaterDone)
|
||||
|
||||
// cleanup for once we are stopped
|
||||
defer func() {
|
||||
// block waiting for our child go routine to also finish
|
||||
<-stateUpdaterDone
|
||||
|
||||
a.logger.Debug("autopilot is now stopped")
|
||||
|
||||
a.finishExecution(exec)
|
||||
a.leaderLock.Unlock()
|
||||
}()
|
||||
|
||||
reconcileTicker := time.NewTicker(a.reconcileInterval)
|
||||
defer reconcileTicker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-reconcileTicker.C:
|
||||
if err := a.reconcile(); err != nil {
|
||||
a.logger.Error("Failed to reconcile current state with the desired state")
|
||||
}
|
||||
|
||||
if err := a.pruneDeadServers(); err != nil {
|
||||
a.logger.Error("Failed to prune dead servers", "error", err)
|
||||
}
|
||||
case <-a.removeDeadCh:
|
||||
if err := a.pruneDeadServers(); err != nil {
|
||||
a.logger.Error("Failed to prune dead servers", "error", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// runStateUpdater will periodically update the autopilot state until the context
// passed in is cancelled. When finished the provided done chan will be closed.
|
||||
func (a *Autopilot) runStateUpdater(ctx context.Context, done chan struct{}) {
|
||||
a.logger.Debug("state update routine is now running")
|
||||
defer func() {
|
||||
a.logger.Debug("state update routine is now stopped")
|
||||
close(done)
|
||||
}()
|
||||
|
||||
ticker := time.NewTicker(a.updateInterval)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
a.updateState(ctx)
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,54 @@
|
|||
package autopilot
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/raft"
|
||||
)
|
||||
|
||||
func DefaultPromoter() Promoter {
|
||||
return new(StablePromoter)
|
||||
}
|
||||
|
||||
type StablePromoter struct{}
|
||||
|
||||
func (_ *StablePromoter) GetServerExt(_ *Config, srv *ServerState) interface{} {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (_ *StablePromoter) GetStateExt(_ *Config, _ *State) interface{} {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (_ *StablePromoter) GetNodeTypes(_ *Config, s *State) map[raft.ServerID]NodeType {
|
||||
types := make(map[raft.ServerID]NodeType)
|
||||
for id := range s.Servers {
|
||||
// this basic implementation has all nodes be of the "voter" type regardless of
|
||||
// any other settings. That means that in a healthy state all nodes in the cluster
|
||||
// will be a voter.
|
||||
types[id] = NodeVoter
|
||||
}
|
||||
return types
|
||||
}
|
||||
|
||||
func (_ *StablePromoter) FilterFailedServerRemovals(_ *Config, _ *State, failed *FailedServers) *FailedServers {
|
||||
return failed
|
||||
}
|
||||
|
||||
// CalculatePromotionsAndDemotions will return a list of all promotions and demotions to be done as well as the server id of
|
||||
// the desired leader. This particular interface implementation maintains a stable leader and will promote healthy servers
|
||||
// to voting status. It will never change the leader ID nor will it perform demotions.
|
||||
func (_ *StablePromoter) CalculatePromotionsAndDemotions(c *Config, s *State) RaftChanges {
|
||||
var changes RaftChanges
|
||||
|
||||
now := time.Now()
|
||||
minStableDuration := s.ServerStabilizationTime(c)
|
||||
for id, server := range s.Servers {
|
||||
// ignore staging state as they are not ready yet
|
||||
if server.State == RaftNonVoter && server.Health.IsStable(now, minStableDuration) {
|
||||
changes.Promotions = append(changes.Promotions, id)
|
||||
}
|
||||
}
|
||||
|
||||
return changes
|
||||
}
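As a sketch of how an application could customize promotion behavior, a promoter may embed StablePromoter and override only CalculatePromotionsAndDemotions; the voter cap below is an arbitrary illustration, not part of this change:

type cappedPromoter struct {
	autopilot.StablePromoter
	maxVoters int
}

func (p *cappedPromoter) CalculatePromotionsAndDemotions(c *autopilot.Config, s *autopilot.State) autopilot.RaftChanges {
	changes := p.StablePromoter.CalculatePromotionsAndDemotions(c, s)
	allowed := p.maxVoters - len(s.Voters)
	if allowed < 0 {
		allowed = 0
	}
	if len(changes.Promotions) > allowed {
		// drop promotions that would exceed the desired number of voters
		changes.Promotions = changes.Promotions[:allowed]
	}
	return changes
}

// wired up via: autopilot.WithPromoter(&cappedPromoter{maxVoters: 5})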
|
|
@ -0,0 +1,398 @@
|
|||
package autopilot
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sort"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/raft"
|
||||
)
|
||||
|
||||
// aliveServers will filter the input map of servers and output one with all of the
|
||||
// servers in a Left state removed.
|
||||
func aliveServers(servers map[raft.ServerID]*Server) map[raft.ServerID]*Server {
|
||||
serverMap := make(map[raft.ServerID]*Server)
|
||||
for _, server := range servers {
|
||||
if server.NodeStatus == NodeLeft {
|
||||
continue
|
||||
}
|
||||
|
||||
serverMap[server.ID] = server
|
||||
}
|
||||
|
||||
return serverMap
|
||||
}
|
||||
|
||||
// nextStateInputs is the collection of values that can influence
|
||||
// creation of the next State.
|
||||
type nextStateInputs struct {
|
||||
Now time.Time
|
||||
StartTime time.Time
|
||||
Config *Config
|
||||
RaftConfig *raft.Configuration
|
||||
KnownServers map[raft.ServerID]*Server
|
||||
LatestIndex uint64
|
||||
LastTerm uint64
|
||||
FetchedStats map[raft.ServerID]*ServerStats
|
||||
LeaderID raft.ServerID
|
||||
}
|
||||
|
||||
// gatherNextStateInputs gathers all the information that would be used to
|
||||
// create the new updated state from.
|
||||
//
|
||||
// - Time Provider's current time.
|
||||
// - Autopilot Config (needed to determine if the stats should indicate unhealthiness)
|
||||
// - Current state
|
||||
// - Raft Configuration
|
||||
// - Known Servers
|
||||
// - Latest raft index (gathered right before the remote server stats so that they should
|
||||
// be from about the same point in time)
|
||||
// - Stats for all non-left servers
|
||||
func (a *Autopilot) gatherNextStateInputs(ctx context.Context) (*nextStateInputs, error) {
|
||||
// there are a lot of inputs to computing the next state so they get put into a
|
||||
// struct so that we don't have to return 8 values.
|
||||
inputs := &nextStateInputs{
|
||||
Now: a.time.Now(),
|
||||
StartTime: a.startTime,
|
||||
}
|
||||
|
||||
// grab the latest autopilot configuration
|
||||
config := a.delegate.AutopilotConfig()
|
||||
if config == nil {
|
||||
return nil, fmt.Errorf("delegate did not return an Autopilot configuration")
|
||||
}
|
||||
inputs.Config = config
|
||||
|
||||
// retrieve the raft configuration
|
||||
raftConfig, err := a.getRaftConfiguration()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get the Raft configuration: %w", err)
|
||||
}
|
||||
inputs.RaftConfig = raftConfig
|
||||
|
||||
leader := a.raft.Leader()
|
||||
for _, s := range inputs.RaftConfig.Servers {
|
||||
if s.Address == leader {
|
||||
inputs.LeaderID = s.ID
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if inputs.LeaderID == "" {
|
||||
return nil, fmt.Errorf("cannot detect the current leader server id from its address: %s", leader)
|
||||
}
|
||||
|
||||
// get the latest Raft index - this should be kept close to the call to
|
||||
// fetch the statistics so that the index values are as close in time as
|
||||
// possible to make the best decision regarding an individual server's
|
||||
// healthiness.
|
||||
inputs.LatestIndex = a.raft.LastIndex()
|
||||
|
||||
term, err := a.lastTerm()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to determine the last Raft term: %w", err)
|
||||
}
|
||||
inputs.LastTerm = term
|
||||
|
||||
// getting the raft configuration could block for a while so now is a good
|
||||
// time to check for context cancellation
|
||||
if ctx.Err() != nil {
|
||||
return nil, ctx.Err()
|
||||
}
|
||||
|
||||
// get the known servers which may include left/failed ones
|
||||
inputs.KnownServers = a.delegate.KnownServers()
|
||||
|
||||
// in most cases getting the known servers should be quick but as we cannot
|
||||
// account for every potential delegate and prevent them from making
|
||||
// blocking network requests we should probably check the context again.
|
||||
if ctx.Err() != nil {
|
||||
return nil, ctx.Err()
|
||||
}
|
||||
|
||||
// we only allow the fetch to take place for up to half the update interval.
// The next update interval will attempt to fetch the stats again but if
|
||||
// we do not see responses within this time then we can assume they are
|
||||
// unhealthy
|
||||
d := inputs.Now.Add(a.updateInterval / 2)
|
||||
fetchCtx, cancel := context.WithDeadline(ctx, d)
|
||||
defer cancel()
|
||||
|
||||
inputs.FetchedStats = a.delegate.FetchServerStats(fetchCtx, aliveServers(inputs.KnownServers))
|
||||
|
||||
// it might be nil but we propagate the ctx.Err just in case our context was
|
||||
// cancelled since the last time we checked.
|
||||
return inputs, ctx.Err()
|
||||
}
|
||||
|
||||
// nextState will gather many inputs about the current state of servers from the
|
||||
// delegate, raft and time provider among other sources and then compute the
|
||||
// next Autopilot state.
|
||||
func (a *Autopilot) nextState(ctx context.Context) (*State, error) {
|
||||
inputs, err := a.gatherNextStateInputs(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
state := a.nextStateWithInputs(inputs)
|
||||
if state.Leader == "" {
|
||||
return nil, fmt.Errorf("Unabled to detect the leader server")
|
||||
}
|
||||
return state, nil
|
||||
}
|
||||
|
||||
// nextStateWithInputs computes the next state given pre-gathered inputs
|
||||
func (a *Autopilot) nextStateWithInputs(inputs *nextStateInputs) *State {
|
||||
nextServers := a.nextServers(inputs)
|
||||
|
||||
newState := &State{
|
||||
startTime: inputs.StartTime,
|
||||
Healthy: true,
|
||||
Servers: nextServers,
|
||||
}
|
||||
|
||||
voterCount := 0
|
||||
healthyVoters := 0
|
||||
|
||||
// This loop will
|
||||
// 1. Determine the ID of the leader server and set it in the state
|
||||
// 2. Count the number of voters in the cluster
|
||||
// 3. Count the number of healthy voters in the cluster
|
||||
// 4. Detect unhealthy servers and mark the overall health as false
|
||||
for id, srv := range nextServers {
|
||||
if !srv.Health.Healthy {
|
||||
// any unhealthiness results in overall unhealthiness
|
||||
newState.Healthy = false
|
||||
}
|
||||
|
||||
switch srv.State {
|
||||
case RaftLeader:
|
||||
newState.Leader = id
|
||||
fallthrough
|
||||
case RaftVoter:
|
||||
newState.Voters = append(newState.Voters, id)
|
||||
voterCount++
|
||||
|
||||
if srv.Health.Healthy {
|
||||
healthyVoters++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we have extra healthy voters, update FailureTolerance from its
|
||||
// zero value in the struct.
|
||||
requiredQuorum := requiredQuorum(voterCount)
|
||||
if healthyVoters > requiredQuorum {
|
||||
newState.FailureTolerance = healthyVoters - requiredQuorum
|
||||
}
|
||||
|
||||
// update any promoter specific overall state
|
||||
if newExt := a.promoter.GetStateExt(inputs.Config, newState); newExt != nil {
|
||||
newState.Ext = newExt
|
||||
}
|
||||
|
||||
// update the node types - these are really informational for users to
|
||||
// know how autopilot and the associated promoter algorithms have classed
// each server as some promotion algorithms may want to keep certain
// servers as non-voters for various reasons. The node type then can be used
|
||||
// to indicate why that might be happening.
|
||||
for id, typ := range a.promoter.GetNodeTypes(inputs.Config, newState) {
|
||||
if srv, ok := newState.Servers[id]; ok {
|
||||
srv.Server.NodeType = typ
|
||||
}
|
||||
}
|
||||
|
||||
// Sort the voters list to keep the output stable. This is done near the end
|
||||
// as SortServers may use other parts of the state that were created in
|
||||
// this method and populated in the newState. Requiring output stability
|
||||
// helps make tests easier to manage and means that if you happen to be dumping
|
||||
// the state periodically you shouldn't see things change unless there
|
||||
// are real changes to server health or overall configuration.
|
||||
SortServers(newState.Voters, newState)
|
||||
|
||||
return newState
|
||||
}
|
||||
|
||||
// nextServers will build out the servers map for the next state to be created
|
||||
// from the given inputs. This will take into account all the various sources
|
||||
// of partial state (current state, raft config, application known servers etc.)
|
||||
// and combine them into the final server map.
|
||||
func (a *Autopilot) nextServers(inputs *nextStateInputs) map[raft.ServerID]*ServerState {
|
||||
newServers := make(map[raft.ServerID]*ServerState)
|
||||
|
||||
for _, srv := range inputs.RaftConfig.Servers {
|
||||
state := a.buildServerState(inputs, srv)
|
||||
|
||||
// update any promoter specific information. This isn't done within
|
||||
// buildServerState to keep that function "pure" and not require
|
||||
// mocking for tests
|
||||
if newExt := a.promoter.GetServerExt(inputs.Config, &state); newExt != nil {
|
||||
state.Server.Ext = newExt
|
||||
}
|
||||
|
||||
newServers[srv.ID] = &state
|
||||
}
|
||||
|
||||
return newServers
|
||||
}
|
||||
|
||||
// buildServerState takes all the nextStateInputs and builds out a ServerState
|
||||
// for the given Raft server. This will take into account the raft configuration
|
||||
// existing state, application known servers and recently fetched stats.
|
||||
func (a *Autopilot) buildServerState(inputs *nextStateInputs, srv raft.Server) ServerState {
|
||||
// Note that the ordering of operations in this method is very important.
|
||||
// We are building up the ServerState from the least important sources
|
||||
// and overriding them with more up to date values.
|
||||
|
||||
// build the basic state from the Raft server
|
||||
state := ServerState{
|
||||
Server: Server{
|
||||
ID: srv.ID,
|
||||
Address: srv.Address,
|
||||
},
|
||||
}
|
||||
|
||||
switch srv.Suffrage {
|
||||
case raft.Voter:
|
||||
state.State = RaftVoter
|
||||
case raft.Nonvoter:
|
||||
state.State = RaftNonVoter
|
||||
case raft.Staging:
|
||||
state.State = RaftStaging
|
||||
default:
|
||||
// should be impossible unless the constants in Raft were updated
|
||||
// to have a new state.
|
||||
// TODO (mkeeler) maybe a panic would be better here. The downside is
|
||||
// that it would be hard to catch that in tests when updating the Raft
|
||||
// version.
|
||||
state.State = RaftNone
|
||||
}
|
||||
|
||||
// overwrite the raft state to mark the leader as such instead of just
|
||||
// a regular voter
|
||||
if srv.ID == inputs.LeaderID {
|
||||
state.State = RaftLeader
|
||||
}
|
||||
|
||||
var previousHealthy *bool
|
||||
|
||||
a.stateLock.RLock()
|
||||
// copy some state from an existing server into the new state - most of this
|
||||
// should be overridden soon but at this point we are just building the base.
|
||||
if existing, found := a.state.Servers[srv.ID]; found {
|
||||
state.Stats = existing.Stats
|
||||
state.Health = existing.Health
|
||||
previousHealthy = &state.Health.Healthy
|
||||
|
||||
// it is important to note that the map values we retrieved this from are
|
||||
// stored by value. Therefore we are modifying a copy of what is in the existing
|
||||
// state and not the actual state itself. We want to ensure that the Address
|
||||
// is what Raft will know about.
|
||||
state.Server = existing.Server
|
||||
state.Server.Address = srv.Address
|
||||
}
|
||||
a.stateLock.RUnlock()
|
||||
|
||||
// pull in the latest information from the application's knowledge of the
|
||||
// server. Mainly we want the NodeStatus & Meta
|
||||
if known, found := inputs.KnownServers[srv.ID]; found {
|
||||
// it is important to note that we are modifying a copy of a Server as the
|
||||
// map we retrieved this from has a non-pointer type value. We definitely
|
||||
// do not want to modify the current known servers but we do want to ensure
|
||||
// that we do not overwrite the Address
|
||||
state.Server = *known
|
||||
state.Server.Address = srv.Address
|
||||
} else {
|
||||
// TODO (mkeeler) do we need a None state. In the previous autopilot code
|
||||
// we would have set this to serf.StatusNone
|
||||
state.Server.NodeStatus = NodeLeft
|
||||
}
|
||||
|
||||
// override the Stats if any were in the fetched results
|
||||
if stats, found := inputs.FetchedStats[srv.ID]; found {
|
||||
state.Stats = *stats
|
||||
}
|
||||
|
||||
// now populate the healthy field given the stats
|
||||
state.Health.Healthy = state.isHealthy(inputs.LastTerm, inputs.LatestIndex, inputs.Config)
|
||||
// overwrite the StableSince field if this is a new server or when
|
||||
// the health status changes. No need for an else as we previously set
|
||||
// it when we overwrote the whole Health structure when finding a
|
||||
// server in the existing state
|
||||
if previousHealthy == nil || *previousHealthy != state.Health.Healthy {
|
||||
state.Health.StableSince = inputs.Now
|
||||
}
|
||||
|
||||
return state
|
||||
}
|
||||
|
||||
// updateState will compute the nextState, set it on the Autopilot instance and
|
||||
// then notify the delegate of the update.
|
||||
func (a *Autopilot) updateState(ctx context.Context) {
|
||||
newState, err := a.nextState(ctx)
|
||||
if err != nil {
|
||||
a.logger.Error("Error when computing next state", "error", err)
|
||||
return
|
||||
}
|
||||
|
||||
a.stateLock.Lock()
|
||||
defer a.stateLock.Unlock()
|
||||
a.state = newState
|
||||
a.delegate.NotifyState(newState)
|
||||
}

// SortServers will take a list of raft ServerIDs and sort it using
// information from the State. See the ServerLessThan function for
// details about how two servers get compared.
func SortServers(ids []raft.ServerID, s *State) {
    sort.Slice(ids, func(i, j int) bool {
        return ServerLessThan(ids[i], ids[j], s)
    })
}

// ServerLessThan will look up both servers in the given State and return
// true if the first id corresponds to a server that is logically less than
// (lower than, better than, etc.) the second server. The following criteria
// are considered in order of most important to least important:
//
// 1. A leader server is always less than all others
// 2. A voter is less than non-voters
// 3. Healthy servers are less than unhealthy servers
// 4. Servers that have been stable longer are considered less than.
func ServerLessThan(id1 raft.ServerID, id2 raft.ServerID, s *State) bool {
    srvI := s.Servers[id1]
    srvJ := s.Servers[id2]

    // the leader always comes first
    if srvI.State == RaftLeader {
        return true
    } else if srvJ.State == RaftLeader {
        return false
    }

    // voters come before non-voters & staging
    if srvI.State == RaftVoter && srvJ.State != RaftVoter {
        return true
    } else if srvI.State != RaftVoter && srvJ.State == RaftVoter {
        return false
    }

    // at this point we know that the raft state of both nodes is roughly
    // equivalent so we want to now sort based on health
    if srvI.Health.Healthy == srvJ.Health.Healthy {
        if srvI.Health.StableSince.Before(srvJ.Health.StableSince) {
            return srvI.Health.Healthy
        } else if srvJ.Health.StableSince.Before(srvI.Health.StableSince) {
            return !srvI.Health.Healthy
        }

        // with all else equal sort by the IDs
        return id1 < id2
    }

    // one of the two isn't healthy. We consider the healthy one as less than
    // the other. So we return true if server I is healthy and false if it isn't
    // as we know that server J is healthy and thus should come before server I.
    return srvI.Health.Healthy
}
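
To make the ordering above concrete, here is a minimal sketch (not part of this change) that sorts three invented server IDs using the vendored github.com/hashicorp/raft-autopilot package; all IDs, states, and timestamps are made up for illustration.

```go
package main

import (
    "fmt"
    "time"

    "github.com/hashicorp/raft"
    autopilot "github.com/hashicorp/raft-autopilot"
)

func main() {
    now := time.Now()
    state := &autopilot.State{
        Servers: map[raft.ServerID]*autopilot.ServerState{
            "a": {State: autopilot.RaftVoter, Health: autopilot.ServerHealth{Healthy: true, StableSince: now.Add(-time.Hour)}},
            "b": {State: autopilot.RaftLeader, Health: autopilot.ServerHealth{Healthy: true, StableSince: now.Add(-time.Hour)}},
            "c": {State: autopilot.RaftNonVoter, Health: autopilot.ServerHealth{Healthy: true, StableSince: now.Add(-time.Minute)}},
        },
    }

    ids := []raft.ServerID{"c", "a", "b"}
    autopilot.SortServers(ids, state)
    fmt.Println(ids) // expected order: [b a c] (leader, then voter, then non-voter)
}
```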

@@ -0,0 +1,298 @@
package autopilot

import (
    "context"
    "time"

    "github.com/hashicorp/raft"
)

//go:generate mockery -all -inpkg -case snake -testonly

// RaftState is the status of a single server in the Raft cluster.
type RaftState string

const (
    RaftNone     RaftState = "none"
    RaftLeader   RaftState = "leader"
    RaftVoter    RaftState = "voter"
    RaftNonVoter RaftState = "non-voter"
    RaftStaging  RaftState = "staging"
)

func (s RaftState) IsPotentialVoter() bool {
    switch s {
    case RaftVoter, RaftStaging, RaftLeader:
        return true
    default:
        return false
    }
}

// NodeStatus represents the health of a server as known to the autopilot consumer.
// This should not take into account Raft health and the server being on a new enough
// term and index.
type NodeStatus string

const (
    NodeUnknown NodeStatus = "unknown"
    NodeAlive   NodeStatus = "alive"
    NodeFailed  NodeStatus = "failed"
    NodeLeft    NodeStatus = "left"
)

type NodeType string

const (
    NodeVoter NodeType = "voter"
)

// Config represents all the tunables of autopilot
type Config struct {
    // CleanupDeadServers controls whether to remove dead servers when a new
    // server is added to the Raft peers.
    CleanupDeadServers bool

    // LastContactThreshold is the limit on the amount of time a server can go
    // without leader contact before being considered unhealthy.
    LastContactThreshold time.Duration

    // MaxTrailingLogs is the amount of entries in the Raft Log that a server can
    // be behind before being considered unhealthy.
    MaxTrailingLogs uint64

    // MinQuorum sets the minimum number of servers required in a cluster
    // before autopilot can prune dead servers.
    MinQuorum uint

    // ServerStabilizationTime is the minimum amount of time a server must be
    // in a stable, healthy state before it can be added to the cluster. Only
    // applicable with Raft protocol version 3 or higher.
    ServerStabilizationTime time.Duration

    Ext interface{}
}
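
For reference, a hedged sketch of constructing this Config from application code; the numeric values below are illustrative assumptions, not defaults introduced by this change.

```go
package main

import (
    "fmt"
    "time"

    autopilot "github.com/hashicorp/raft-autopilot"
)

func main() {
    // Values here are purely illustrative.
    conf := &autopilot.Config{
        CleanupDeadServers:      true,
        LastContactThreshold:    10 * time.Second,
        MaxTrailingLogs:         1000,
        MinQuorum:               3,
        ServerStabilizationTime: 10 * time.Second,
    }
    fmt.Printf("%+v\n", conf)
}
```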

// Server represents one Raft server
type Server struct {
    // This first set of fields are those that the autopilot delegate
    // needs to fill in

    ID          raft.ServerID
    Name        string
    Address     raft.ServerAddress
    NodeStatus  NodeStatus
    Version     string
    Meta        map[string]string
    RaftVersion int

    // The remaining fields are those that the promoter
    // will fill in

    NodeType NodeType
    Ext      interface{}
}

type ServerState struct {
    Server Server
    State  RaftState
    Stats  ServerStats
    Health ServerHealth
}

func (s *ServerState) HasVotingRights() bool {
    return s.State == RaftVoter || s.State == RaftLeader
}

// isHealthy determines whether this ServerState is considered healthy
// based on the given Autopilot config
func (s *ServerState) isHealthy(lastTerm uint64, leaderLastIndex uint64, conf *Config) bool {
    if s.Server.NodeStatus != NodeAlive {
        return false
    }

    if s.Stats.LastContact > conf.LastContactThreshold || s.Stats.LastContact < 0 {
        return false
    }

    if s.Stats.LastTerm != lastTerm {
        return false
    }

    if leaderLastIndex > conf.MaxTrailingLogs && s.Stats.LastIndex < leaderLastIndex-conf.MaxTrailingLogs {
        return false
    }

    return true
}
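
A hypothetical in-package test sketch (it would have to live in a _test.go file inside package autopilot, since isHealthy is unexported) that walks through the rule above with invented numbers.

```go
package autopilot

import (
    "testing"
    "time"
)

func TestIsHealthySketch(t *testing.T) {
    conf := &Config{
        LastContactThreshold: 10 * time.Second,
        MaxTrailingLogs:      250,
    }

    srv := &ServerState{
        Server: Server{NodeStatus: NodeAlive},
        Stats: ServerStats{
            LastContact: 200 * time.Millisecond,
            LastTerm:    7,      // matches the leader's last term below
            LastIndex:   10_000, // only 50 entries behind the leader
        },
    }

    // leader last term 7, leader last index 10_050: within MaxTrailingLogs -> healthy
    if !srv.isHealthy(7, 10_050, conf) {
        t.Fatal("expected server to be healthy")
    }

    // 5_000 entries behind the leader exceeds MaxTrailingLogs -> unhealthy
    srv.Stats.LastIndex = 5_050
    if srv.isHealthy(7, 10_050, conf) {
        t.Fatal("expected server to be unhealthy")
    }
}
```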

type ServerHealth struct {
    // Healthy is whether or not the server is healthy according to the current
    // Autopilot config.
    Healthy bool

    // StableSince is the last time this server's Healthy value changed.
    StableSince time.Time
}

// IsStable returns true if the ServerHealth shows a stable, healthy state
// that has lasted at least minStableDuration as of the given time.
func (h *ServerHealth) IsStable(now time.Time, minStableDuration time.Duration) bool {
    if h == nil {
        return false
    }

    if !h.Healthy {
        return false
    }

    if now.Sub(h.StableSince) < minStableDuration {
        return false
    }

    return true
}

// ServerStats holds miscellaneous Raft metrics for a server
type ServerStats struct {
    // LastContact is the time since this node's last contact with the leader.
    LastContact time.Duration

    // LastTerm is the highest leader term this server has a record of in its Raft log.
    LastTerm uint64

    // LastIndex is the last log index this server has a record of in its Raft log.
    LastIndex uint64
}

type State struct {
    startTime        time.Time
    Healthy          bool
    FailureTolerance int
    Servers          map[raft.ServerID]*ServerState
    Leader           raft.ServerID
    Voters           []raft.ServerID
    Ext              interface{}
}

func (s *State) ServerStabilizationTime(c *Config) time.Duration {
    // Only use the configured stabilization time when autopilot has
    // been running for 110% of the configured stabilization time.
    // Before that time we haven't been running long enough to
    // be able to take these values into account. 110% is pretty
    // arbitrary but with the default config would prevent the
    // stabilization time from mattering for an extra second. This
    // allows for leeway in how quickly we get the healthy RPC responses
    // after autopilot is started.
    if time.Since(s.startTime) > (c.ServerStabilizationTime*110)/100 {
        return c.ServerStabilizationTime
    }

    // ignore stabilization time if autopilot hasn't been running long enough
    // to be tracking any server long enough to meet that requirement
    return 0
}
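
A small example of the 110% grace period, written as a hypothetical in-package example test (startTime is unexported, so it is only reachable from inside package autopilot); the durations are invented. With ServerStabilizationTime of 10s, the full threshold only applies once autopilot itself has been running for more than 11s.

```go
package autopilot

import (
    "fmt"
    "time"
)

func ExampleState_ServerStabilizationTime() {
    c := &Config{ServerStabilizationTime: 10 * time.Second}

    young := &State{startTime: time.Now().Add(-5 * time.Second)}   // autopilot running for ~5s
    mature := &State{startTime: time.Now().Add(-12 * time.Second)} // running for ~12s, past the 11s grace period

    fmt.Println(young.ServerStabilizationTime(c))  // not running long enough yet
    fmt.Println(mature.ServerStabilizationTime(c)) // full stabilization time applies
    // Output:
    // 0s
    // 10s
}
```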

// Raft is the interface of all the methods on the Raft type that autopilot needs to function. Autopilot will
// take in an interface for Raft instead of a concrete type to allow for dependency injection in tests.
type Raft interface {
    AddNonvoter(id raft.ServerID, address raft.ServerAddress, prevIndex uint64, timeout time.Duration) raft.IndexFuture
    AddVoter(id raft.ServerID, address raft.ServerAddress, prevIndex uint64, timeout time.Duration) raft.IndexFuture
    DemoteVoter(id raft.ServerID, prevIndex uint64, timeout time.Duration) raft.IndexFuture
    LastIndex() uint64
    Leader() raft.ServerAddress
    GetConfiguration() raft.ConfigurationFuture
    RemoveServer(id raft.ServerID, prevIndex uint64, timeout time.Duration) raft.IndexFuture
    Stats() map[string]string
    LeadershipTransferToServer(id raft.ServerID, address raft.ServerAddress) raft.Future
}
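
The interface above is expected to be satisfied by the concrete *raft.Raft type from the vendored hashicorp/raft, which is what lets tests swap in a mock. A compile-time assertion (an assumption of mine, not code from this change) makes that explicit:

```go
package main

import (
    "github.com/hashicorp/raft"
    autopilot "github.com/hashicorp/raft-autopilot"
)

// Fails to compile if *raft.Raft ever stops satisfying the autopilot.Raft interface.
var _ autopilot.Raft = (*raft.Raft)(nil)

func main() {}
```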

type ApplicationIntegration interface {
    // AutopilotConfig is used to retrieve the latest configuration from the delegate
    AutopilotConfig() *Config

    // NotifyState will be called when the autopilot state is updated. The application may choose to emit metrics
    // or perform other actions based on this information.
    NotifyState(*State)

    // FetchServerStats will be called to request the application fetch the ServerStats out of band. Usually this
    // will require an RPC to each server.
    FetchServerStats(context.Context, map[raft.ServerID]*Server) map[raft.ServerID]*ServerStats

    // KnownServers fetches the list of servers as known to the application
    KnownServers() map[raft.ServerID]*Server

    // RemoveFailedServer notifies the application to forcefully remove the server in the failed state.
    // It is expected that this returns nearly immediately, so if a longer-running operation needs to be
    // performed then the delegate implementation should spawn a goroutine itself.
    RemoveFailedServer(*Server)
}
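
A minimal, hypothetical delegate sketch showing the shape of this contract; the type and field names are invented, and a real application (such as a storage backend) would report real servers and stats here.

```go
package main

import (
    "context"

    "github.com/hashicorp/raft"
    autopilot "github.com/hashicorp/raft-autopilot"
)

// noopDelegate is an illustrative ApplicationIntegration that does nothing useful.
type noopDelegate struct {
    config  *autopilot.Config
    servers map[raft.ServerID]*autopilot.Server
}

func (d *noopDelegate) AutopilotConfig() *autopilot.Config { return d.config }

func (d *noopDelegate) NotifyState(state *autopilot.State) {
    // a real delegate could emit per-server health metrics here
}

func (d *noopDelegate) FetchServerStats(ctx context.Context, servers map[raft.ServerID]*autopilot.Server) map[raft.ServerID]*autopilot.ServerStats {
    // a real delegate would RPC each server; return empty stats here
    return make(map[raft.ServerID]*autopilot.ServerStats)
}

func (d *noopDelegate) KnownServers() map[raft.ServerID]*autopilot.Server { return d.servers }

func (d *noopDelegate) RemoveFailedServer(srv *autopilot.Server) {
    // a real delegate would remove the peer asynchronously
}

// Compile-time check that the sketch satisfies the interface.
var _ autopilot.ApplicationIntegration = (*noopDelegate)(nil)

func main() {}
```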

type RaftChanges struct {
    Promotions []raft.ServerID
    Demotions  []raft.ServerID
    Leader     raft.ServerID
}

type FailedServers struct {
    // StaleNonVoters are the ids of those servers present in the raft configuration as non-voters
    // that are not present in the delegate's view of what servers should be available.
    StaleNonVoters []raft.ServerID
    // StaleVoters are the ids of those servers present in the raft configuration as voters that
    // are not present in the delegate's view of what servers should be available.
    StaleVoters []raft.ServerID

    // FailedNonVoters are the servers without voting rights in the cluster that the
    // delegate has indicated are in a failed state.
    FailedNonVoters []*Server
    // FailedVoters are the servers with voting rights in the cluster that the
    // delegate has indicated are in a failed state.
    FailedVoters []*Server
}

// Promoter is an interface to provide promotion/demotion algorithms to the core autopilot type.
// The BasicPromoter satisfies this interface and will promote any stable servers but other
// algorithms could be implemented. The implementation of these methods shouldn't "block".
// While they are synchronous, autopilot expects the algorithms to not make any network
// or other requests which may cause an indefinite amount of waiting to occur.
//
// Note that all parameters passed to these functions should be considered read-only and
// their modification could result in undefined behavior of the core autopilot routines
// including potential crashes.
type Promoter interface {
    // GetServerExt returns some object that should be stored in the Ext field of the Server.
    // This value will not be used by the code in this repo but may be used by the other
    // Promoter methods and the application utilizing autopilot. If the value returned is
    // nil the extended state will not be updated.
    GetServerExt(*Config, *ServerState) interface{}

    // GetStateExt returns some object that should be stored in the Ext field of the State.
    // This value will not be used by the code in this repo but may be used by the other
    // Promoter methods and the application utilizing autopilot. If the value returned is
    // nil the extended state will not be updated.
    GetStateExt(*Config, *State) interface{}

    // GetNodeTypes returns a map of ServerID to NodeType for all the servers which
    // should have their NodeType field updated
    GetNodeTypes(*Config, *State) map[raft.ServerID]NodeType

    // CalculatePromotionsAndDemotions returns the changes (promotions, demotions and an
    // optional leadership transfer) that the promoter wants applied to the raft configuration.
    CalculatePromotionsAndDemotions(*Config, *State) RaftChanges

    // FilterFailedServerRemovals takes in the current state and structure outlining all the
    // failed/stale servers and will return those failed servers which the promoter thinks
    // should be allowed to be removed.
    FilterFailedServerRemovals(*Config, *State, *FailedServers) *FailedServers
}
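
A hedged sketch of a custom Promoter that promotes every stable non-voter and never demotes anyone. It is illustrative only and is not the BasicPromoter shipped with the library; the type name is invented.

```go
package main

import (
    "time"

    "github.com/hashicorp/raft"
    autopilot "github.com/hashicorp/raft-autopilot"
)

type stablePromoter struct{}

func (p *stablePromoter) GetServerExt(*autopilot.Config, *autopilot.ServerState) interface{} { return nil }

func (p *stablePromoter) GetStateExt(*autopilot.Config, *autopilot.State) interface{} { return nil }

func (p *stablePromoter) GetNodeTypes(c *autopilot.Config, s *autopilot.State) map[raft.ServerID]autopilot.NodeType {
    types := make(map[raft.ServerID]autopilot.NodeType)
    for id := range s.Servers {
        // every server is a potential voter in this sketch
        types[id] = autopilot.NodeVoter
    }
    return types
}

func (p *stablePromoter) CalculatePromotionsAndDemotions(c *autopilot.Config, s *autopilot.State) autopilot.RaftChanges {
    var changes autopilot.RaftChanges
    minStable := s.ServerStabilizationTime(c)
    now := time.Now()
    for id, srv := range s.Servers {
        // promote non-voters that have been stable for the stabilization time
        if !srv.HasVotingRights() && srv.Health.IsStable(now, minStable) {
            changes.Promotions = append(changes.Promotions, id)
        }
    }
    return changes
}

func (p *stablePromoter) FilterFailedServerRemovals(c *autopilot.Config, s *autopilot.State, failed *autopilot.FailedServers) *autopilot.FailedServers {
    // allow all removals through unchanged
    return failed
}

// Compile-time check that the sketch satisfies the interface.
var _ autopilot.Promoter = (*stablePromoter)(nil)

func main() {}
```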

// timeProvider is an interface for getting a local time. This is mainly useful for testing
// to inject certain times so that output validation is easier.
type timeProvider interface {
    Now() time.Time
}

type runtimeTimeProvider struct{}

func (_ *runtimeTimeProvider) Now() time.Time {
    return time.Now()
}

@@ -3,6 +3,19 @@
IMPROVEMENTS

* Remove `StartAsLeader` configuration option [[GH-364](https://github.com/hashicorp/raft/pull/386)]
* Allow futures to react to `Shutdown()` to prevent a deadlock with `takeSnapshot()` [[GH-390](https://github.com/hashicorp/raft/pull/390)]
* Prevent non-voters from becoming eligible for leadership elections [[GH-398](https://github.com/hashicorp/raft/pull/398)]
* Remove an unneeded `io.Copy` from snapshot writes [[GH-399](https://github.com/hashicorp/raft/pull/399)]
* Log decoded candidate address in `duplicate requestVote` warning [[GH-400](https://github.com/hashicorp/raft/pull/400)]
* Prevent starting a TCP transport when IP address is `nil` [[GH-403](https://github.com/hashicorp/raft/pull/403)]
* Reject leadership transfer requests when in candidate state to prevent indefinite blocking while unable to elect a leader [[GH-413](https://github.com/hashicorp/raft/pull/413)]
* Add labels for metric metadata to reduce cardinality of metric names [[GH-409](https://github.com/hashicorp/raft/pull/409)]
* Add peers metric [[GH-413](https://github.com/hashicorp/raft/pull/431)]

BUG FIXES

* Make `LeaderCh` always deliver the latest leadership transition [[GH-384](https://github.com/hashicorp/raft/pull/384)]
* Handle updating an existing peer in `startStopReplication` [[GH-419](https://github.com/hashicorp/raft/pull/419)]

# 1.1.2 (January 17th, 2020)

@@ -2,9 +2,16 @@ package api
import (
    "context"
    "encoding/json"
    "errors"
    "fmt"
    "io"
    "net/http"
    "time"

    "github.com/hashicorp/vault/sdk/helper/parseutil"

    "github.com/mitchellh/mapstructure"

    "github.com/hashicorp/vault/sdk/helper/consts"
)

@@ -27,6 +34,77 @@ type RaftJoinRequest struct {
    NonVoter bool `json:"non_voter"`
}

// AutopilotConfig is used for querying/setting the Autopilot configuration.
type AutopilotConfig struct {
    CleanupDeadServers             bool          `json:"cleanup_dead_servers" mapstructure:"cleanup_dead_servers"`
    LastContactThreshold           time.Duration `json:"last_contact_threshold" mapstructure:"-"`
    DeadServerLastContactThreshold time.Duration `json:"dead_server_last_contact_threshold" mapstructure:"-"`
    MaxTrailingLogs                uint64        `json:"max_trailing_logs" mapstructure:"max_trailing_logs"`
    MinQuorum                      uint          `json:"min_quorum" mapstructure:"min_quorum"`
    ServerStabilizationTime        time.Duration `json:"server_stabilization_time" mapstructure:"-"`
}

// UnmarshalJSON parses the autopilot config JSON blob
func (ac *AutopilotConfig) UnmarshalJSON(b []byte) error {
    var data interface{}
    err := json.Unmarshal(b, &data)
    if err != nil {
        return err
    }

    conf := data.(map[string]interface{})
    if err = mapstructure.WeakDecode(conf, ac); err != nil {
        return err
    }
    if ac.LastContactThreshold, err = parseutil.ParseDurationSecond(conf["last_contact_threshold"]); err != nil {
        return err
    }
    if ac.DeadServerLastContactThreshold, err = parseutil.ParseDurationSecond(conf["dead_server_last_contact_threshold"]); err != nil {
        return err
    }
    if ac.ServerStabilizationTime, err = parseutil.ParseDurationSecond(conf["server_stabilization_time"]); err != nil {
        return err
    }
    return nil
}
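
An illustrative payload for the parser above; the field values are invented. The duration fields go through the custom UnmarshalJSON, so they may be supplied as Go-style duration strings (or integer seconds) rather than nanosecond counts.

```go
package main

import (
    "encoding/json"
    "fmt"

    "github.com/hashicorp/vault/api"
)

func main() {
    blob := []byte(`{
        "cleanup_dead_servers": true,
        "last_contact_threshold": "10s",
        "dead_server_last_contact_threshold": "24h",
        "max_trailing_logs": 1000,
        "min_quorum": 3,
        "server_stabilization_time": "10s"
    }`)

    var conf api.AutopilotConfig
    if err := json.Unmarshal(blob, &conf); err != nil {
        panic(err)
    }
    fmt.Printf("%+v\n", conf)
}
```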

// AutopilotExecutionStatus represents the current status of the autopilot background goroutines
type AutopilotExecutionStatus string

const (
    AutopilotNotRunning   AutopilotExecutionStatus = "not-running"
    AutopilotRunning      AutopilotExecutionStatus = "running"
    AutopilotShuttingDown AutopilotExecutionStatus = "shutting-down"
)

// AutopilotState represents the response of the raft autopilot state API
type AutopilotState struct {
    ExecutionStatus            AutopilotExecutionStatus    `mapstructure:"execution_status"`
    Healthy                    bool                        `mapstructure:"healthy"`
    FailureTolerance           int                         `mapstructure:"failure_tolerance"`
    OptimisticFailureTolerance int                         `mapstructure:"optimistic_failure_tolerance"`
    Servers                    map[string]*AutopilotServer `mapstructure:"servers"`
    Leader                     string                      `mapstructure:"leader"`
    Voters                     []string                    `mapstructure:"voters"`
    NonVoters                  []string                    `mapstructure:"non_voters"`
}

// AutopilotServer represents the server blocks in the response of the raft
// autopilot state API.
type AutopilotServer struct {
    ID          string            `mapstructure:"id"`
    Name        string            `mapstructure:"name"`
    Address     string            `mapstructure:"address"`
    NodeStatus  string            `mapstructure:"node_status"`
    LastContact string            `mapstructure:"last_contact"`
    LastTerm    uint64            `mapstructure:"last_term"`
    LastIndex   uint64            `mapstructure:"last_index"`
    Healthy     bool              `mapstructure:"healthy"`
    StableSince string            `mapstructure:"stable_since"`
    Status      string            `mapstructure:"status"`
    Meta        map[string]string `mapstructure:"meta"`
}

// RaftJoin adds the node from which this call is invoked to the raft
// cluster represented by the leader address in the parameter.
func (c *Sys) RaftJoin(opts *RaftJoinRequest) (*RaftJoinResponse, error) {

@@ -160,3 +238,79 @@ func (c *Sys) RaftSnapshotRestore(snapReader io.Reader, force bool) error {
    return nil
}

// RaftAutopilotState returns the state of the raft cluster as seen by autopilot.
func (c *Sys) RaftAutopilotState() (*AutopilotState, error) {
    r := c.c.NewRequest("GET", "/v1/sys/storage/raft/autopilot/state")

    ctx, cancelFunc := context.WithCancel(context.Background())
    defer cancelFunc()
    resp, err := c.c.RawRequestWithContext(ctx, r)
    if resp != nil {
        defer resp.Body.Close()
        if resp.StatusCode == 404 {
            return nil, nil
        }
    }
    if err != nil {
        return nil, err
    }

    secret, err := ParseSecret(resp.Body)
    if err != nil {
        return nil, err
    }
    if secret == nil || secret.Data == nil {
        return nil, errors.New("data from server response is empty")
    }

    var result AutopilotState
    err = mapstructure.Decode(secret.Data, &result)
    if err != nil {
        return nil, err
    }

    return &result, err
}

// RaftAutopilotConfiguration fetches the autopilot config.
func (c *Sys) RaftAutopilotConfiguration() (*AutopilotConfig, error) {
    r := c.c.NewRequest("GET", "/v1/sys/storage/raft/autopilot/configuration")

    ctx, cancelFunc := context.WithCancel(context.Background())
    defer cancelFunc()
    resp, err := c.c.RawRequestWithContext(ctx, r)
    if resp != nil {
        defer resp.Body.Close()
        if resp.StatusCode == 404 {
            return nil, nil
        }
    }
    if err != nil {
        return nil, err
    }

    secret, err := ParseSecret(resp.Body)
    if err != nil {
        return nil, err
    }
    if secret == nil {
        return nil, errors.New("data from server response is empty")
    }

    var result AutopilotConfig
    if err = mapstructure.Decode(secret.Data, &result); err != nil {
        return nil, err
    }
    if result.LastContactThreshold, err = parseutil.ParseDurationSecond(secret.Data["last_contact_threshold"]); err != nil {
        return nil, err
    }
    if result.DeadServerLastContactThreshold, err = parseutil.ParseDurationSecond(secret.Data["dead_server_last_contact_threshold"]); err != nil {
        return nil, err
    }
    if result.ServerStabilizationTime, err = parseutil.ParseDurationSecond(secret.Data["server_stabilization_time"]); err != nil {
        return nil, err
    }

    return &result, err
}
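
A rough client usage sketch for the two new methods; it assumes VAULT_ADDR and VAULT_TOKEN are set in the environment and keeps error handling minimal. Note that both methods return nil, nil on a 404, so the results are nil-checked.

```go
package main

import (
    "fmt"
    "log"

    "github.com/hashicorp/vault/api"
)

func main() {
    client, err := api.NewClient(api.DefaultConfig())
    if err != nil {
        log.Fatal(err)
    }

    state, err := client.Sys().RaftAutopilotState()
    if err != nil {
        log.Fatal(err)
    }
    if state != nil {
        fmt.Println("healthy:", state.Healthy, "leader:", state.Leader)
    }

    conf, err := client.Sys().RaftAutopilotConfiguration()
    if err != nil {
        log.Fatal(err)
    }
    if conf != nil {
        fmt.Println("cleanup_dead_servers:", conf.CleanupDeadServers)
    }
}
```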

@@ -536,8 +536,10 @@ github.com/hashicorp/mdns
# github.com/hashicorp/nomad/api v0.0.0-20191220223628-edc62acd919d
github.com/hashicorp/nomad/api
github.com/hashicorp/nomad/api/contexts
# github.com/hashicorp/raft v1.1.3-0.20201002073007-f367681f9c48
# github.com/hashicorp/raft v1.2.0
github.com/hashicorp/raft
# github.com/hashicorp/raft-autopilot v0.1.2
github.com/hashicorp/raft-autopilot
# github.com/hashicorp/raft-snapshot v1.0.3
github.com/hashicorp/raft-snapshot
# github.com/hashicorp/serf v0.9.5