1695 lines
49 KiB
Go
1695 lines
49 KiB
Go
package raft
|
|
|
|
import (
|
|
"context"
|
|
"crypto/tls"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"io/ioutil"
|
|
"os"
|
|
"path/filepath"
|
|
"strconv"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/armon/go-metrics"
|
|
"github.com/golang/protobuf/proto"
|
|
log "github.com/hashicorp/go-hclog"
|
|
wrapping "github.com/hashicorp/go-kms-wrapping"
|
|
"github.com/hashicorp/go-raftchunking"
|
|
"github.com/hashicorp/go-secure-stdlib/tlsutil"
|
|
"github.com/hashicorp/go-uuid"
|
|
"github.com/hashicorp/raft"
|
|
autopilot "github.com/hashicorp/raft-autopilot"
|
|
raftboltdb "github.com/hashicorp/raft-boltdb/v2"
|
|
snapshot "github.com/hashicorp/raft-snapshot"
|
|
"github.com/hashicorp/vault/helper/metricsutil"
|
|
"github.com/hashicorp/vault/sdk/helper/consts"
|
|
"github.com/hashicorp/vault/sdk/helper/jsonutil"
|
|
"github.com/hashicorp/vault/sdk/logical"
|
|
"github.com/hashicorp/vault/sdk/physical"
|
|
"github.com/hashicorp/vault/vault/cluster"
|
|
"github.com/hashicorp/vault/vault/seal"
|
|
bolt "go.etcd.io/bbolt"
|
|
)
|
|
|
|
const (
	// EnvVaultRaftNodeID names the environment variable consulted for the
	// Raft node ID.
	EnvVaultRaftNodeID = "VAULT_RAFT_NODE_ID"

	// EnvVaultRaftPath names the environment variable consulted for the
	// directory in which Raft data is stored.
	EnvVaultRaftPath = "VAULT_RAFT_PATH"
)

// getMmapFlags returns the mmap flags to use for the given path. This
// implementation ignores its argument and always returns 0.
// NOTE(review): declared as a variable, presumably so it can be replaced
// elsewhere in the package — confirm.
var getMmapFlags = func(_ string) int {
	return 0
}
|
|
|
|
// Compile-time assertions: the build fails if RaftBackend or RaftLock stops
// implementing one of the physical storage interfaces listed here.
var _ physical.Backend = (*RaftBackend)(nil)

var _ physical.Transactional = (*RaftBackend)(nil)

var _ physical.HABackend = (*RaftBackend)(nil)

var _ physical.Lock = (*RaftLock)(nil)
|
|
|
|
var (
	// raftLogCacheSize caps how many log entries are kept in the in-memory
	// cache. The cache spares disk I/O for recently committed entries.
	raftLogCacheSize = 512

	// raftState is the sub-directory of the data directory that holds raft
	// state; peersFileName is the recovery file looked up inside it.
	raftState     = "raft/"
	peersFileName = "peers.json"

	// restoreOpDelayDuration is a five second delay associated with restore
	// operations (its consumer is outside this section of the file).
	restoreOpDelayDuration = 5 * time.Second

	// defaultMaxEntrySize is the entry size limit applied when none is
	// configured: twice the raft chunking chunk size.
	defaultMaxEntrySize = uint64(2 * raftchunking.ChunkSize)
)
|
|
|
|
// RaftBackend is the raft-based implementation of the backend interfaces:
// writes are persisted to the FSM by way of the raft protocol.
type RaftBackend struct {
	logger log.Logger
	conf   map[string]string
	l      sync.RWMutex

	// fsm is the state store that holds vault's data.
	fsm *FSM

	// raft is the raft instance this backend operates on. It is nil until
	// the cluster has been set up.
	raft *raft.Raft

	// raftInitCh lets HA lock acquisition block while raft has not been
	// initialized yet, which can happen when raft is used for HA only.
	raftInitCh chan struct{}

	// raftNotifyCh delivers leadership change notifications that concern
	// this node.
	raftNotifyCh chan bool

	// streamLayer is the network layer connecting the nodes of the raft
	// cluster.
	streamLayer *raftLayer

	// raftTransport is the transport the raft library uses for its RPC
	// traffic.
	raftTransport raft.Transport

	// snapStore is the snapshot mechanism.
	snapStore raft.SnapshotStore

	// logStore is where the raft library keeps the raft logs durably.
	logStore raft.LogStore

	// stableStore is where the raft library keeps additional metadata
	// durably.
	stableStore raft.StableStore

	// bootstrapConfig is set only while this node still has to be
	// bootstrapped on startup.
	bootstrapConfig *raft.Configuration

	// dataDir is the local filesystem location for raft and FSM data.
	dataDir string

	// localID identifies this node. It comes from the configuration, from a
	// file on disk, or is generated randomly.
	localID string

	// serverAddressProvider maps server IDs to addresses.
	serverAddressProvider raft.ServerAddressProvider

	// permitPool bounds the number of concurrent storage calls.
	permitPool *physical.PermitPool

	// maxEntrySize is the size limit, in bytes, of a single raft entry (put
	// or transaction). A value of twice the raft chunking size is suggested
	// for optimal performance.
	maxEntrySize uint64

	// autopilot is the raft-autopilot instance. It is instantiated on the
	// leader and on followers alike, but only the active node has a
	// "running" autopilot.
	autopilot *autopilot.Autopilot

	// autopilotConfig is the configuration needed to instantiate autopilot.
	autopilotConfig *AutopilotConfig

	// followerStates holds information about every peer of the raft leader.
	// It tracks peer state and is used to decide whether peers are "alive"
	// based on the heartbeats received from them.
	followerStates *FollowerStates

	// followerHeartbeatTicker drives the computation of dead servers from
	// follower state heartbeats.
	followerHeartbeatTicker *time.Ticker

	// disableAutopilot, when set, bypasses the autopilot implementation and
	// falls back to talking to the raft instance directly. It can only be
	// set at startup through the VAULT_RAFT_AUTOPILOT_DISABLE environment
	// variable and cannot change while the node is running.
	disableAutopilot bool

	// autopilotReconcileInterval is the reconcile interval handed to
	// autopilot; zero when "autopilot_reconcile_interval" is not configured.
	autopilotReconcileInterval time.Duration
}
|
|
|
|
// LeaderJoinInfo describes one candidate leader and everything a node needs
// in order to join that leader's raft cluster as a follower.
type LeaderJoinInfo struct {
	// AutoJoin holds cloud auto-join metadata. When supplied, Vault tries to
	// discover peers automatically, in addition to anything given through
	// 'leader_api_addr'.
	AutoJoin string `json:"auto_join"`

	// AutoJoinScheme is the optional URI scheme applied to addresses found
	// through auto-join.
	AutoJoinScheme string `json:"auto_join_scheme"`

	// AutoJoinPort is the optional port applied to addresses found through
	// auto-join.
	AutoJoinPort uint `json:"auto_join_port"`

	// LeaderAPIAddr is the address of the leader node to connect to.
	LeaderAPIAddr string `json:"leader_api_addr"`

	// LeaderCACert is the CA certificate of the leader node.
	LeaderCACert string `json:"leader_ca_cert"`

	// LeaderClientCert is the client certificate the follower presents for
	// TLS client authentication.
	LeaderClientCert string `json:"leader_client_cert"`

	// LeaderClientKey is the client key the follower uses for TLS client
	// authentication.
	LeaderClientKey string `json:"leader_client_key"`

	// LeaderCACertFile is the on-disk path of the leader's CA certificate.
	// It should only be provided via Vault's configuration file.
	LeaderCACertFile string `json:"leader_ca_cert_file"`

	// LeaderClientCertFile is the on-disk path of the client certificate the
	// follower presents for TLS client authentication. It should only be
	// provided via Vault's configuration file.
	LeaderClientCertFile string `json:"leader_client_cert_file"`

	// LeaderClientKeyFile is the on-disk path of the client key the follower
	// uses for TLS client authentication. It should only be provided via
	// Vault's configuration file.
	LeaderClientKeyFile string `json:"leader_client_key_file"`

	// LeaderTLSServerName is the optional ServerName expected in the
	// leader's certificate, used in place of the host/IP actually dialed.
	LeaderTLSServerName string `json:"leader_tls_servername"`

	// Retry tells whether the join should be retried automatically. It is
	// never read from or written to JSON.
	Retry bool `json:"-"`

	// TLSConfig is what the API client uses when talking to the leader
	// node. It is never read from or written to JSON.
	TLSConfig *tls.Config `json:"-"`
}
|
|
|
|
// JoinConfig decodes the "retry_join" configuration value into the list of
// candidate leader nodes this node can join as a follower.
//
// It returns (nil, nil) when "retry_join" is unset or empty. Otherwise the
// value is decoded as JSON into a list of LeaderJoinInfo entries. Each entry
// is validated, has Retry set to true, and receives the TLSConfig built by
// parseTLSInfo.
//
// An error is returned when the value cannot be decoded, when the list is
// empty, when an entry is null, when an entry sets both auto_join and
// leader_api_addr, when auto_join_scheme is set to anything other than
// "http" or "https", or when the TLS configuration cannot be built.
func (b *RaftBackend) JoinConfig() ([]*LeaderJoinInfo, error) {
	config := b.conf["retry_join"]
	if config == "" {
		return nil, nil
	}

	var leaderInfos []*LeaderJoinInfo
	if err := jsonutil.DecodeJSON([]byte(config), &leaderInfos); err != nil {
		return nil, fmt.Errorf("failed to decode retry_join config: %w", err)
	}

	if len(leaderInfos) == 0 {
		return nil, errors.New("invalid retry_join config")
	}

	for i, info := range leaderInfos {
		// A null list element leaves a nil pointer in the slice; reject it
		// here instead of dereferencing it below.
		if info == nil {
			return nil, fmt.Errorf("invalid retry_join config: entry at index %d is null", i)
		}

		if len(info.AutoJoin) != 0 && len(info.LeaderAPIAddr) != 0 {
			return nil, errors.New("cannot provide both a leader_api_addr and auto_join")
		}

		switch info.AutoJoinScheme {
		case "", "http", "https":
			// An unset scheme is allowed; it is optional.
		default:
			return nil, fmt.Errorf("invalid scheme '%s'; must either be http or https", info.AutoJoinScheme)
		}

		info.Retry = true

		tlsConfig, err := parseTLSInfo(info)
		if err != nil {
			return nil, fmt.Errorf("failed to create tls config to communicate with leader node (retry_join index: %d): %w", i, err)
		}
		info.TLSConfig = tlsConfig
	}

	return leaderInfos, nil
}
|
|
|
|
// parseTLSInfo builds the client TLS configuration described by leaderInfo.
// File paths take precedence: if any of the *File fields is set the
// configuration is loaded from disk, otherwise the raw certificate and key
// content is used if any of it is set. When neither form is present the
// result is (nil, nil). Whenever a configuration is produced, its ServerName
// is set to leaderInfo.LeaderTLSServerName.
func parseTLSInfo(leaderInfo *LeaderJoinInfo) (*tls.Config, error) {
	fromFiles := leaderInfo.LeaderCACertFile != "" ||
		leaderInfo.LeaderClientCertFile != "" ||
		leaderInfo.LeaderClientKeyFile != ""
	fromContent := leaderInfo.LeaderCACert != "" ||
		leaderInfo.LeaderClientCert != "" ||
		leaderInfo.LeaderClientKey != ""

	var (
		cfg *tls.Config
		err error
	)
	switch {
	case fromFiles:
		cfg, err = tlsutil.LoadClientTLSConfig(leaderInfo.LeaderCACertFile, leaderInfo.LeaderClientCertFile, leaderInfo.LeaderClientKeyFile)
	case fromContent:
		cfg, err = tlsutil.ClientTLSConfig([]byte(leaderInfo.LeaderCACert), []byte(leaderInfo.LeaderClientCert), []byte(leaderInfo.LeaderClientKey))
	}
	if err != nil {
		return nil, err
	}
	if cfg == nil {
		return nil, nil
	}

	cfg.ServerName = leaderInfo.LeaderTLSServerName
	return cfg, nil
}
|
|
|
|
// EnsurePath creates a directory, along with any missing parents, using mode
// 0700. When dir is true, path itself is the directory to create; when dir
// is false, path is treated as a file path and its parent directory is
// created instead.
func EnsurePath(path string, dir bool) error {
	target := path
	if !dir {
		target = filepath.Dir(path)
	}
	return os.MkdirAll(target, 0o700)
}
|
|
|
|
// NewRaftBackend constructs a RaftBackend from the given configuration.
//
// The data directory comes from the VAULT_RAFT_PATH environment variable or,
// when that is empty, from conf["path"]; it is an error for neither to be
// set. The node ID is resolved in this order: the VAULT_RAFT_NODE_ID
// environment variable, conf["node_id"], the contents of the "node-id" file
// in the data directory, and finally a newly generated UUID that is written
// to that file.
//
// Optional conf keys read here:
//   - "apply_delay": duration slept in the FSM apply callback.
//   - "snapshot_delay": duration slept before each snapshot is created.
//   - "max_entry_size": non-negative integer size limit for a raft entry, in
//     bytes; defaults to defaultMaxEntrySize.
//   - "autopilot_reconcile_interval": duration stored on the backend.
//
// An error is returned when a value fails to parse or when the FSM, the
// bolt-backed log/stable store, the log cache or the snapshot store cannot
// be created.
func NewRaftBackend(conf map[string]string, logger log.Logger) (physical.Backend, error) {
	path := os.Getenv(EnvVaultRaftPath)
	if path == "" {
		pathFromConfig, ok := conf["path"]
		if !ok {
			return nil, errors.New("'path' must be set")
		}
		path = pathFromConfig
	}

	var localID string
	{
		// Determine the local node ID from the environment.
		if raftNodeID := os.Getenv(EnvVaultRaftNodeID); raftNodeID != "" {
			localID = raftNodeID
		}

		// If not set in the environment check the configuration file.
		if len(localID) == 0 {
			localID = conf["node_id"]
		}

		// If not set in the config check the "node-id" file.
		if len(localID) == 0 {
			localIDRaw, err := ioutil.ReadFile(filepath.Join(path, "node-id"))
			switch {
			case err == nil:
				if len(localIDRaw) > 0 {
					localID = string(localIDRaw)
				}
			case os.IsNotExist(err):
				// No file yet; fall through to generating an ID below.
			default:
				return nil, err
			}
		}

		// If all of the above fails generate a UUID and persist it to the
		// "node-id" file.
		if len(localID) == 0 {
			id, err := uuid.GenerateUUID()
			if err != nil {
				return nil, err
			}

			if err := ioutil.WriteFile(filepath.Join(path, "node-id"), []byte(id), 0o600); err != nil {
				return nil, err
			}

			localID = id
		}
	}

	// Create the FSM.
	fsm, err := NewFSM(path, localID, logger.Named("fsm"))
	if err != nil {
		// Wrap with %w so callers can still inspect the underlying error.
		return nil, fmt.Errorf("failed to create fsm: %w", err)
	}

	if delayRaw, ok := conf["apply_delay"]; ok {
		delay, err := time.ParseDuration(delayRaw)
		if err != nil {
			return nil, fmt.Errorf("apply_delay does not parse as a duration: %w", err)
		}
		fsm.applyCallback = func() {
			time.Sleep(delay)
		}
	}

	// Build an all in-memory setup for dev mode, otherwise prepare a full
	// disk-based setup.
	var logStore raft.LogStore
	var stable raft.StableStore
	var snap raft.SnapshotStore

	// devMode is never set to true in this function, so the in-memory
	// branch below is currently not taken.
	var devMode bool
	if devMode {
		store := raft.NewInmemStore()
		stable = store
		logStore = store
		snap = raft.NewInmemSnapshotStore()
	} else {
		// Create the base raft path.
		raftPath := filepath.Join(path, raftState)
		if err := EnsurePath(raftPath, true); err != nil {
			return nil, err
		}

		// Create the backend raft store for logs and stable storage.
		dbPath := filepath.Join(raftPath, "raft.db")
		opts := boltOptions(dbPath)
		raftOptions := raftboltdb.Options{
			Path:        dbPath,
			BoltOptions: opts,
		}
		store, err := raftboltdb.New(raftOptions)
		if err != nil {
			return nil, err
		}
		stable = store

		// Wrap the store in a LogCache to improve performance.
		cacheStore, err := raft.NewLogCache(raftLogCacheSize, store)
		if err != nil {
			return nil, err
		}
		logStore = cacheStore

		// Create the snapshot store.
		snapshots, err := NewBoltSnapshotStore(raftPath, logger.Named("snapshot"), fsm)
		if err != nil {
			return nil, err
		}
		snap = snapshots
	}

	if delayRaw, ok := conf["snapshot_delay"]; ok {
		delay, err := time.ParseDuration(delayRaw)
		if err != nil {
			return nil, fmt.Errorf("snapshot_delay does not parse as a duration: %w", err)
		}
		snap = newSnapshotStoreDelay(snap, delay, logger)
	}

	maxEntrySize := defaultMaxEntrySize
	if maxEntrySizeCfg := conf["max_entry_size"]; len(maxEntrySizeCfg) != 0 {
		i, err := strconv.Atoi(maxEntrySizeCfg)
		if err != nil {
			return nil, fmt.Errorf("failed to parse 'max_entry_size': %w", err)
		}
		// A negative value would silently wrap around to an enormous limit
		// when converted to uint64.
		if i < 0 {
			return nil, fmt.Errorf("failed to parse 'max_entry_size': value %d must not be negative", i)
		}

		maxEntrySize = uint64(i)
	}

	var reconcileInterval time.Duration
	if intervalRaw := conf["autopilot_reconcile_interval"]; intervalRaw != "" {
		interval, err := time.ParseDuration(intervalRaw)
		if err != nil {
			return nil, fmt.Errorf("autopilot_reconcile_interval does not parse as a duration: %w", err)
		}
		reconcileInterval = interval
	}

	return &RaftBackend{
		logger:                     logger,
		fsm:                        fsm,
		raftInitCh:                 make(chan struct{}),
		conf:                       conf,
		logStore:                   logStore,
		stableStore:                stable,
		snapStore:                  snap,
		dataDir:                    path,
		localID:                    localID,
		permitPool:                 physical.NewPermitPool(physical.DefaultParallelOperations),
		maxEntrySize:               maxEntrySize,
		followerHeartbeatTicker:    time.NewTicker(time.Second),
		autopilotReconcileInterval: reconcileInterval,
	}, nil
}
|
|
|
|
// snapshotStoreDelay wraps a raft.SnapshotStore and pauses for a fixed
// duration before every snapshot creation.
type snapshotStoreDelay struct {
	// logger receives a trace message ahead of each delayed creation.
	logger log.Logger

	// wrapped is the snapshot store all calls are forwarded to.
	wrapped raft.SnapshotStore

	// delay is how long Create sleeps before forwarding the call.
	delay time.Duration
}
|
|
|
|
// Create logs a trace message, sleeps for the configured delay, and then
// forwards the call unchanged to the wrapped snapshot store.
func (s snapshotStoreDelay) Create(version raft.SnapshotVersion, index, term uint64, configuration raft.Configuration, configurationIndex uint64, trans raft.Transport) (raft.SnapshotSink, error) {
	pause := s.delay
	s.logger.Trace("delaying before creating snapshot", "delay", pause)
	time.Sleep(pause)

	sink, err := s.wrapped.Create(version, index, term, configuration, configurationIndex, trans)
	return sink, err
}
|
|
|
|
// List forwards to the wrapped snapshot store without any delay.
func (s snapshotStoreDelay) List() ([]*raft.SnapshotMeta, error) {
	metas, err := s.wrapped.List()
	return metas, err
}
|
|
|
|
// Open forwards to the wrapped snapshot store without any delay.
func (s snapshotStoreDelay) Open(id string) (*raft.SnapshotMeta, io.ReadCloser, error) {
	meta, reader, err := s.wrapped.Open(id)
	return meta, reader, err
}
|
|
|
|
// Compile-time assertion that *snapshotStoreDelay implements
// raft.SnapshotStore.
var _ raft.SnapshotStore = (*snapshotStoreDelay)(nil)
|
|
|
|
// newSnapshotStoreDelay returns a snapshot store that forwards to snap and
// sleeps for delay before each Create call, logging through logger.
func newSnapshotStoreDelay(snap raft.SnapshotStore, delay time.Duration, logger log.Logger) *snapshotStoreDelay {
	store := new(snapshotStoreDelay)
	store.logger = logger
	store.wrapped = snap
	store.delay = delay
	return store
}
|
|
|
|
// Close is used to gracefully close all file resources. N.B. This method
// should only be called if you are sure the RaftBackend will never be used
// again.
//
// Both the FSM database and the stable store are closed even when closing
// the first one fails, so that a failure on one does not leave the other
// open. If both fail, the FSM database error is the one returned. A stable
// store that has no Close method is skipped.
func (b *RaftBackend) Close() error {
	b.l.Lock()
	defer b.l.Unlock()

	fsmErr := b.fsm.db.Close()

	// Use a checked assertion: an unchecked one would panic for a stable
	// store that is not closable.
	var storeErr error
	if closer, ok := b.stableStore.(io.Closer); ok {
		storeErr = closer.Close()
	}

	if fsmErr != nil {
		return fsmErr
	}
	return storeErr
}
|
|
|
|
// CollectMetrics publishes bolt statistics to sink for the raft log/stable
// store (labelled "logstore") and for the FSM database (labelled "fsm").
//
// The statistics are read while holding the read lock; the sink is called
// after the lock has been released. Log store statistics are skipped when
// the stable store is not a *raftboltdb.BoltStore.
func (b *RaftBackend) CollectMetrics(sink *metricsutil.ClusterMetricSink) {
	var (
		logstoreStats bolt.Stats
		haveLogstore  bool
	)

	b.l.RLock()
	// Checked assertion: an unchecked one would panic for any other stable
	// store implementation.
	if store, ok := b.stableStore.(*raftboltdb.BoltStore); ok {
		logstoreStats = store.Stats()
		haveLogstore = true
	}
	fsmStats := b.fsm.db.Stats()
	b.l.RUnlock()

	if haveLogstore {
		b.collectMetricsWithStats(logstoreStats, sink, "logstore")
	}
	b.collectMetricsWithStats(fsmStats, sink, "fsm")
}
|
|
|
|
// collectMetricsWithStats reports one set of bolt statistics to sink. Every
// metric is named raft_storage.bolt.<group>.<name> and carries a "database"
// label set to database. Counters and sizes are reported as gauges; the
// rebalance, spill and write times are reported as samples in milliseconds.
func (b *RaftBackend) collectMetricsWithStats(stats bolt.Stats, sink *metricsutil.ClusterMetricSink, database string) {
	tx := stats.TxStats
	labels := []metricsutil.Label{{"database", database}}

	gauge := func(group, name string, value float32) {
		sink.SetGaugeWithLabels([]string{"raft_storage", "bolt", group, name}, value, labels)
	}
	sample := func(group, name string, value float32) {
		sink.AddSampleWithLabels([]string{"raft_storage", "bolt", group, name}, value, labels)
	}

	// Freelist statistics.
	gauge("freelist", "free_pages", float32(stats.FreePageN))
	gauge("freelist", "pending_pages", float32(stats.PendingPageN))
	gauge("freelist", "allocated_bytes", float32(stats.FreeAlloc))
	gauge("freelist", "used_bytes", float32(stats.FreelistInuse))

	// Read transaction statistics.
	gauge("transaction", "started_read_transactions", float32(stats.TxN))
	gauge("transaction", "currently_open_read_transactions", float32(stats.OpenTxN))

	// Per-transaction statistics.
	gauge("page", "count", float32(tx.PageCount))
	gauge("page", "bytes_allocated", float32(tx.PageAlloc))
	gauge("cursor", "count", float32(tx.CursorCount))
	gauge("node", "count", float32(tx.NodeCount))
	gauge("node", "dereferences", float32(tx.NodeDeref))
	gauge("rebalance", "count", float32(tx.Rebalance))
	sample("rebalance", "time", float32(tx.RebalanceTime.Milliseconds()))
	gauge("split", "count", float32(tx.Split))
	gauge("spill", "count", float32(tx.Spill))
	sample("spill", "time", float32(tx.SpillTime.Milliseconds()))
	gauge("write", "count", float32(tx.Write))
	sample("write", "time", float32(tx.WriteTime.Milliseconds()))
}
|
|
|
|
// RaftServer describes a single server that is part of the Raft
// configuration.
type RaftServer struct {
	// NodeID is the server's name.
	NodeID string `json:"node_id"`

	// Address is the server's IP:port for Raft communications.
	Address string `json:"address"`

	// Leader reports whether this server is the current cluster leader.
	Leader bool `json:"leader"`

	// ProtocolVersion is the raft protocol version the server uses.
	ProtocolVersion string `json:"protocol_version"`

	// Voter reports whether this server has a vote in the cluster. It may be
	// false while the server is staging and still coming online.
	Voter bool `json:"voter"`
}
|
|
|
|
// RaftConfigurationResponse is the answer to a query for the current Raft
// configuration.
type RaftConfigurationResponse struct {
	// Servers lists the servers in the Raft configuration.
	Servers []*RaftServer `json:"servers"`

	// Index is the Raft index of this configuration.
	Index uint64 `json:"index"`
}
|
|
|
|
// Peer identifies one member of the raft cluster by ID and address, together
// with its suffrage given as an integer.
type Peer struct {
	ID       string `json:"id"`
	Address  string `json:"address"`
	Suffrage int    `json:"suffrage"`
}
|
|
|
|
// NodeID reports this node's identifier, i.e. the local ID the backend was
// constructed with. No lock is taken.
func (b *RaftBackend) NodeID() string {
	id := b.localID
	return id
}
|
|
|
|
// Initialized reports whether raft is running, which is the case exactly
// when the backend currently holds a raft instance. The check is made under
// the read lock.
func (b *RaftBackend) Initialized() bool {
	b.l.RLock()
	defer b.l.RUnlock()

	return b.raft != nil
}
|
|
|
|
// SetTLSKeyring installs a new keyring by handing it to the stream layer
// while holding the read lock, and returns whatever error the stream layer
// reports. Per the original documentation of this method, a change of the
// active key also closes existing network connections and streams so that
// they reconnect with the new key.
// NOTE(review): streamLayer is only assigned when a network transport is set
// up — confirm setTLSKeyring tolerates a nil receiver.
func (b *RaftBackend) SetTLSKeyring(keyring *TLSKeyring) error {
	b.l.RLock()
	defer b.l.RUnlock()

	return b.streamLayer.setTLSKeyring(keyring)
}
|
|
|
|
// SetServerAddressProvider stores the address provider used to determine the
// raft node addresses. The assignment happens under the write lock. This is
// currently only used in tests.
func (b *RaftBackend) SetServerAddressProvider(provider raft.ServerAddressProvider) {
	b.l.Lock()
	defer b.l.Unlock()

	b.serverAddressProvider = provider
}
|
|
|
|
// Bootstrap records the given peers as the configuration to bootstrap the
// raft cluster with. Nothing is started here: the configuration is only
// stored on the backend for later use.
//
// It fails when checking for existing raft state fails, or when the log,
// stable or snapshot store already holds state.
func (b *RaftBackend) Bootstrap(peers []Peer) error {
	b.l.Lock()
	defer b.l.Unlock()

	switch hasState, err := raft.HasExistingState(b.logStore, b.stableStore, b.snapStore); {
	case err != nil:
		return err
	case hasState:
		return errors.New("error bootstrapping cluster: cluster already has state")
	}

	// Translate each peer into its raft server representation.
	servers := make([]raft.Server, 0, len(peers))
	for _, peer := range peers {
		servers = append(servers, raft.Server{
			ID:       raft.ServerID(peer.ID),
			Address:  raft.ServerAddress(peer.Address),
			Suffrage: raft.ServerSuffrage(peer.Suffrage),
		})
	}

	// Keep the configuration around for later use.
	b.bootstrapConfig = &raft.Configuration{Servers: servers}
	return nil
}
|
|
|
|
// SetRestoreCallback installs restoreCb on the FSM as the callback to use
// when a restoreCallbackOp is processed. The FSM's own lock is held for the
// assignment.
func (b *RaftBackend) SetRestoreCallback(restoreCb restoreCallback) {
	fsm := b.fsm

	fsm.l.Lock()
	defer fsm.l.Unlock()

	fsm.restoreCb = restoreCb
}
|
|
|
|
// applyConfigSettings copies the backend's settings onto the given raft
// configuration.
//
// It always sets the logger, NoSnapshotRestoreOnStart (true),
// MaxAppendEntries (64) and BatchApplyCh (true). The election, heartbeat and
// leader lease timeouts are multiplied by "performance_multiplier" (5 when
// not configured). "snapshot_threshold", "trailing_logs" and
// "snapshot_interval" override the corresponding raft settings when present.
//
// An error naming the offending key is returned when a value cannot be
// parsed, or when "snapshot_threshold" or "trailing_logs" is negative.
func (b *RaftBackend) applyConfigSettings(config *raft.Config) error {
	config.Logger = b.logger

	multiplier := 5
	if raw, ok := b.conf["performance_multiplier"]; ok {
		parsed, err := strconv.Atoi(raw)
		if err != nil {
			return fmt.Errorf("failed to parse 'performance_multiplier': %w", err)
		}
		multiplier = parsed
	}
	config.ElectionTimeout = config.ElectionTimeout * time.Duration(multiplier)
	config.HeartbeatTimeout = config.HeartbeatTimeout * time.Duration(multiplier)
	config.LeaderLeaseTimeout = config.LeaderLeaseTimeout * time.Duration(multiplier)

	if raw, ok := b.conf["snapshot_threshold"]; ok {
		snapThreshold, err := strconv.Atoi(raw)
		if err != nil {
			return fmt.Errorf("failed to parse 'snapshot_threshold': %w", err)
		}
		// A negative value would wrap around when converted to uint64.
		if snapThreshold < 0 {
			return fmt.Errorf("failed to parse 'snapshot_threshold': value %d must not be negative", snapThreshold)
		}
		config.SnapshotThreshold = uint64(snapThreshold)
	}

	if raw, ok := b.conf["trailing_logs"]; ok {
		trailingLogs, err := strconv.Atoi(raw)
		if err != nil {
			return fmt.Errorf("failed to parse 'trailing_logs': %w", err)
		}
		// A negative value would wrap around when converted to uint64.
		if trailingLogs < 0 {
			return fmt.Errorf("failed to parse 'trailing_logs': value %d must not be negative", trailingLogs)
		}
		config.TrailingLogs = uint64(trailingLogs)
	}

	if raw, ok := b.conf["snapshot_interval"]; ok {
		snapshotInterval, err := time.ParseDuration(raw)
		if err != nil {
			return fmt.Errorf("snapshot_interval does not parse as a duration: %w", err)
		}
		config.SnapshotInterval = snapshotInterval
	}

	config.NoSnapshotRestoreOnStart = true
	config.MaxAppendEntries = 64

	// Setting BatchApplyCh allows the raft library to enqueue up to
	// MaxAppendEntries into each raft apply rather than relying on the
	// scheduler.
	config.BatchApplyCh = true

	return nil
}
|
|
|
|
// SetupOpts carries the options accepted by the raft setup function.
type SetupOpts struct {
	// TLSKeyring is the keyring used for the cluster traffic.
	TLSKeyring *TLSKeyring

	// ClusterListener is the cluster hook through which the raft handler and
	// client are registered with core's cluster listeners.
	ClusterListener cluster.ClusterHook

	// StartAsLeader makes this node start as leader, bypassing the leader
	// election. Use with caution.
	StartAsLeader bool

	// RecoveryModeConfig is the raft cluster configuration to use in
	// recovery mode.
	RecoveryModeConfig *raft.Configuration
}
|
|
|
|
// StartRecoveryCluster sets up the raft cluster in recovery mode with a
// configuration that contains peer as its only server, starting this node as
// leader.
//
// ctx is passed on to SetupCluster, so cancelling it aborts the wait for
// leadership. A nil ctx is treated as context.Background().
func (b *RaftBackend) StartRecoveryCluster(ctx context.Context, peer Peer) error {
	// Guard against a nil context so that callers which passed nil while the
	// argument was ignored keep working.
	if ctx == nil {
		ctx = context.Background()
	}

	recoveryModeConfig := &raft.Configuration{
		Servers: []raft.Server{
			{
				ID:      raft.ServerID(peer.ID),
				Address: raft.ServerAddress(peer.Address),
			},
		},
	}

	return b.SetupCluster(ctx, SetupOpts{
		StartAsLeader:      true,
		RecoveryModeConfig: recoveryModeConfig,
	})
}
|
|
|
|
// HasState reports whether the log, stable or snapshot store already holds
// raft state, as determined by raft.HasExistingState. The stores are
// inspected under the read lock.
func (b *RaftBackend) HasState() (bool, error) {
	b.l.RLock()
	defer b.l.RUnlock()

	hasState, err := raft.HasExistingState(b.logStore, b.stableStore, b.snapStore)
	return hasState, err
}
|
|
|
|
// SetupCluster starts the raft cluster and enables the networking needed for
// the raft nodes to communicate.
//
// It is a no-op when raft has already been started. Otherwise it:
//   - builds the raft configuration from the backend settings;
//   - picks the transport: in-memory when neither a keyring nor a cluster
//     listener is supplied, a network transport when both are, and an error
//     when only one of the two is present;
//   - bootstraps the cluster if a bootstrap configuration was stored;
//   - performs peers.json recovery if that file exists in the raft
//     directory, deleting the file afterwards;
//   - applies opts.RecoveryModeConfig when it is set;
//   - creates the raft instance and, when opts.StartAsLeader is set, waits
//     until this node is leader or ctx is done;
//   - registers the stream layer with the cluster listener and closes
//     raftInitCh to signal that setup has completed.
//
// The write lock is held for the whole call.
func (b *RaftBackend) SetupCluster(ctx context.Context, opts SetupOpts) error {
	b.logger.Trace("setting up raft cluster")

	b.l.Lock()
	defer b.l.Unlock()

	// We are already unsealed
	if b.raft != nil {
		b.logger.Debug("raft already started, not setting up cluster")
		return nil
	}

	if len(b.localID) == 0 {
		return errors.New("no local node id configured")
	}

	// Setup the raft config
	raftConfig := raft.DefaultConfig()
	if err := b.applyConfigSettings(raftConfig); err != nil {
		return err
	}

	// listenerIsNil reports whether cl is a nil interface value or wraps a
	// nil *cluster.Listener (an interface holding a typed nil pointer does
	// not compare equal to nil).
	listenerIsNil := func(cl cluster.ClusterHook) bool {
		if cl == nil {
			return true
		}
		// Concrete type checks
		if l, ok := cl.(*cluster.Listener); ok {
			return l == nil
		}
		return false
	}

	switch {
	case opts.TLSKeyring == nil && listenerIsNil(opts.ClusterListener):
		// If we don't have a provided network we use an in-memory one.
		// This allows us to bootstrap a node without bringing up a cluster
		// network. This will be true during bootstrap, tests and dev modes.
		_, b.raftTransport = raft.NewInmemTransportWithTimeout(raft.ServerAddress(b.localID), time.Second)
	case opts.TLSKeyring == nil:
		return errors.New("no keyring provided")
	case listenerIsNil(opts.ClusterListener):
		return errors.New("no cluster listener provided")
	default:
		// Set the local address and localID in the streaming layer and the raft config.
		streamLayer, err := NewRaftLayer(b.logger.Named("stream"), opts.TLSKeyring, opts.ClusterListener)
		if err != nil {
			return err
		}
		transConfig := &raft.NetworkTransportConfig{
			Stream:                streamLayer,
			MaxPool:               3,
			Timeout:               10 * time.Second,
			ServerAddressProvider: b.serverAddressProvider,
			Logger:                b.logger.Named("raft-net"),
		}
		transport := raft.NewNetworkTransportWithConfig(transConfig)

		b.streamLayer = streamLayer
		b.raftTransport = transport
	}

	raftConfig.LocalID = raft.ServerID(b.localID)

	// Set up a channel for reliable leader notifications.
	raftNotifyCh := make(chan bool, 10)
	raftConfig.NotifyCh = raftNotifyCh

	// If we have a bootstrapConfig set we should bootstrap now.
	if b.bootstrapConfig != nil {
		bootstrapConfig := b.bootstrapConfig
		// Unset the bootstrap config
		b.bootstrapConfig = nil

		// Bootstrap raft with our known cluster members.
		if err := raft.BootstrapCluster(raftConfig, b.logStore, b.stableStore, b.snapStore, b.raftTransport, *bootstrapConfig); err != nil {
			return err
		}
	}

	// Setup the Raft store.
	b.fsm.SetNoopRestore(true)

	raftPath := filepath.Join(b.dataDir, raftState)
	peersFile := filepath.Join(raftPath, peersFileName)
	// Recovery is attempted only when the peers file can be stat'ed; any
	// stat error (including "does not exist") skips it.
	if _, statErr := os.Stat(peersFile); statErr == nil {
		b.logger.Info("raft recovery initiated", "recovery_file", peersFileName)

		recoveryConfig, err := raft.ReadConfigJSON(peersFile)
		if err != nil {
			return fmt.Errorf("raft recovery failed to parse peers.json: %w", err)
		}

		// Non-voting servers are only allowed in enterprise. If Suffrage is disabled,
		// error out to indicate that it isn't allowed.
		for idx := range recoveryConfig.Servers {
			if !nonVotersAllowed && recoveryConfig.Servers[idx].Suffrage == raft.Nonvoter {
				return fmt.Errorf("raft recovery failed to parse configuration for node %q: setting `non_voter` is only supported in enterprise", recoveryConfig.Servers[idx].ID)
			}
		}

		b.logger.Info("raft recovery found new config", "config", recoveryConfig)

		if err := raft.RecoverCluster(raftConfig, b.fsm, b.logStore, b.stableStore, b.snapStore, b.raftTransport, recoveryConfig); err != nil {
			return fmt.Errorf("raft recovery failed: %w", err)
		}

		if err := os.Remove(peersFile); err != nil {
			return fmt.Errorf("raft recovery failed to delete peers.json; please delete manually: %w", err)
		}
		b.logger.Info("raft recovery deleted peers.json")
	}

	if opts.RecoveryModeConfig != nil {
		if err := raft.RecoverCluster(raftConfig, b.fsm, b.logStore, b.stableStore, b.snapStore, b.raftTransport, *opts.RecoveryModeConfig); err != nil {
			return fmt.Errorf("recovering raft cluster failed: %w", err)
		}
	}

	b.logger.Info("creating Raft", "config", fmt.Sprintf("%#v", raftConfig))
	raftObj, err := raft.NewRaft(raftConfig, b.fsm.chunker, b.logStore, b.stableStore, b.snapStore, b.raftTransport)
	// Restores are switched back on whether or not raft could be created.
	b.fsm.SetNoopRestore(false)
	if err != nil {
		return err
	}

	// If we are expecting to start as leader wait until we win the election.
	// This should happen quickly since there is only one node in the cluster.
	// StartAsLeader is only set during init, recovery mode, storage migration,
	// and tests.
	if opts.StartAsLeader {
		// ticker is used to prevent memory leak of using time.After in
		// for - select pattern.
		ticker := time.NewTicker(10 * time.Millisecond)
		defer ticker.Stop()
		for raftObj.State() != raft.Leader {
			select {
			case <-ctx.Done():
				if shutdownErr := raftObj.Shutdown().Error(); shutdownErr != nil {
					return fmt.Errorf("shutdown while waiting for leadership: %w", shutdownErr)
				}

				return errors.New("shutdown while waiting for leadership")
			case <-ticker.C:
			}
		}
	}

	b.raft = raftObj
	b.raftNotifyCh = raftNotifyCh

	// NOTE(review): on failure here the raft instance stays assigned and
	// raftInitCh is not closed — confirm callers tear the cluster down.
	if err := b.fsm.upgradeLocalNodeConfig(); err != nil {
		b.logger.Error("failed to upgrade local node configuration", "error", err)
		return err
	}

	if b.streamLayer != nil {
		// Add Handler to the cluster.
		opts.ClusterListener.AddHandler(consts.RaftStorageALPN, b.streamLayer)

		// Add Client to the cluster.
		opts.ClusterListener.AddClient(consts.RaftStorageALPN, b.streamLayer)
	}

	// Close the init channel to signal setup has been completed
	close(b.raftInitCh)

	b.logger.Trace("finished setting up raft cluster")
	return nil
}
|
|
|
|
// TeardownCluster shuts down the raft cluster
|
|
func (b *RaftBackend) TeardownCluster(clusterListener cluster.ClusterHook) error {
|
|
if clusterListener != nil {
|
|
clusterListener.StopHandler(consts.RaftStorageALPN)
|
|
clusterListener.RemoveClient(consts.RaftStorageALPN)
|
|
}
|
|
|
|
b.l.Lock()
|
|
|
|
// Perform shutdown only if the raft object is non-nil. The object could be nil
|
|
// if the node is unsealed but has not joined the peer set.
|
|
var future raft.Future
|
|
if b.raft != nil {
|
|
future = b.raft.Shutdown()
|
|
}
|
|
|
|
b.raft = nil
|
|
|
|
// If we're tearing down, then we need to recreate the raftInitCh
|
|
b.raftInitCh = make(chan struct{})
|
|
b.l.Unlock()
|
|
|
|
if future != nil {
|
|
return future.Error()
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// CommittedIndex returns the latest index committed to stable storage
|
|
func (b *RaftBackend) CommittedIndex() uint64 {
|
|
b.l.RLock()
|
|
defer b.l.RUnlock()
|
|
|
|
if b.raft == nil {
|
|
return 0
|
|
}
|
|
|
|
return b.raft.LastIndex()
|
|
}
|
|
|
|
// AppliedIndex returns the latest index applied to the FSM
|
|
func (b *RaftBackend) AppliedIndex() uint64 {
|
|
b.l.RLock()
|
|
defer b.l.RUnlock()
|
|
|
|
if b.fsm == nil {
|
|
return 0
|
|
}
|
|
|
|
// We use the latest index that the FSM has seen here, which may be behind
|
|
// raft.AppliedIndex() due to the async nature of the raft library.
|
|
indexState, _ := b.fsm.LatestState()
|
|
return indexState.Index
|
|
}
|
|
|
|
// Term returns the raft term of this node.
|
|
func (b *RaftBackend) Term() uint64 {
|
|
b.l.RLock()
|
|
defer b.l.RUnlock()
|
|
|
|
if b.fsm == nil {
|
|
return 0
|
|
}
|
|
|
|
// We use the latest index that the FSM has seen here, which may be behind
|
|
// raft.AppliedIndex() due to the async nature of the raft library.
|
|
indexState, _ := b.fsm.LatestState()
|
|
return indexState.Term
|
|
}
|
|
|
|
// RemovePeer removes the given peer ID from the raft cluster. If the node is
|
|
// ourselves we will give up leadership.
|
|
func (b *RaftBackend) RemovePeer(ctx context.Context, peerID string) error {
|
|
b.l.RLock()
|
|
defer b.l.RUnlock()
|
|
|
|
if err := ctx.Err(); err != nil {
|
|
return err
|
|
}
|
|
|
|
if b.disableAutopilot {
|
|
if b.raft == nil {
|
|
return errors.New("raft storage is not initialized")
|
|
}
|
|
b.logger.Trace("removing server from raft", "id", peerID)
|
|
future := b.raft.RemoveServer(raft.ServerID(peerID), 0, 0)
|
|
return future.Error()
|
|
}
|
|
|
|
if b.autopilot == nil {
|
|
return errors.New("raft storage autopilot is not initialized")
|
|
}
|
|
|
|
b.logger.Trace("removing server from raft via autopilot", "id", peerID)
|
|
return b.autopilot.RemoveServer(raft.ServerID(peerID))
|
|
}
|
|
|
|
// GetConfigurationOffline is used to read the stale, last known raft
|
|
// configuration to this node. It accesses the last state written into the
|
|
// FSM. When a server is online use GetConfiguration instead.
|
|
func (b *RaftBackend) GetConfigurationOffline() (*RaftConfigurationResponse, error) {
|
|
b.l.RLock()
|
|
defer b.l.RUnlock()
|
|
|
|
if b.raft != nil {
|
|
return nil, errors.New("raft storage is initialized, used GetConfiguration instead")
|
|
}
|
|
|
|
if b.fsm == nil {
|
|
return nil, nil
|
|
}
|
|
|
|
state, configuration := b.fsm.LatestState()
|
|
config := &RaftConfigurationResponse{
|
|
Index: state.Index,
|
|
}
|
|
|
|
if configuration == nil || configuration.Servers == nil {
|
|
return config, nil
|
|
}
|
|
|
|
for _, server := range configuration.Servers {
|
|
entry := &RaftServer{
|
|
NodeID: server.Id,
|
|
Address: server.Address,
|
|
// Since we are offline no node is the leader.
|
|
Leader: false,
|
|
Voter: raft.ServerSuffrage(server.Suffrage) == raft.Voter,
|
|
}
|
|
config.Servers = append(config.Servers, entry)
|
|
}
|
|
|
|
return config, nil
|
|
}
|
|
|
|
func (b *RaftBackend) GetConfiguration(ctx context.Context) (*RaftConfigurationResponse, error) {
|
|
if err := ctx.Err(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
b.l.RLock()
|
|
defer b.l.RUnlock()
|
|
|
|
if b.raft == nil {
|
|
return nil, errors.New("raft storage is not initialized")
|
|
}
|
|
|
|
future := b.raft.GetConfiguration()
|
|
if err := future.Error(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
config := &RaftConfigurationResponse{
|
|
Index: future.Index(),
|
|
}
|
|
|
|
for _, server := range future.Configuration().Servers {
|
|
entry := &RaftServer{
|
|
NodeID: string(server.ID),
|
|
Address: string(server.Address),
|
|
// Since we only service this request on the active node our node ID
|
|
// denotes the raft leader.
|
|
Leader: string(server.ID) == b.NodeID(),
|
|
Voter: server.Suffrage == raft.Voter,
|
|
ProtocolVersion: strconv.Itoa(raft.ProtocolVersionMax),
|
|
}
|
|
config.Servers = append(config.Servers, entry)
|
|
}
|
|
|
|
return config, nil
|
|
}
|
|
|
|
// AddPeer adds a new server to the raft cluster
|
|
func (b *RaftBackend) AddPeer(ctx context.Context, peerID, clusterAddr string) error {
|
|
if err := ctx.Err(); err != nil {
|
|
return err
|
|
}
|
|
|
|
b.l.RLock()
|
|
defer b.l.RUnlock()
|
|
|
|
if b.disableAutopilot {
|
|
if b.raft == nil {
|
|
return errors.New("raft storage is not initialized")
|
|
}
|
|
b.logger.Trace("adding server to raft", "id", peerID)
|
|
future := b.raft.AddVoter(raft.ServerID(peerID), raft.ServerAddress(clusterAddr), 0, 0)
|
|
return future.Error()
|
|
}
|
|
|
|
if b.autopilot == nil {
|
|
return errors.New("raft storage autopilot is not initialized")
|
|
}
|
|
|
|
b.logger.Trace("adding server to raft via autopilot", "id", peerID)
|
|
return b.autopilot.AddServer(&autopilot.Server{
|
|
ID: raft.ServerID(peerID),
|
|
Name: peerID,
|
|
Address: raft.ServerAddress(clusterAddr),
|
|
RaftVersion: raft.ProtocolVersionMax,
|
|
NodeType: autopilot.NodeVoter,
|
|
})
|
|
}
|
|
|
|
// Peers returns all the servers present in the raft cluster
|
|
func (b *RaftBackend) Peers(ctx context.Context) ([]Peer, error) {
|
|
if err := ctx.Err(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
b.l.RLock()
|
|
defer b.l.RUnlock()
|
|
|
|
if b.raft == nil {
|
|
return nil, errors.New("raft storage is not initialized")
|
|
}
|
|
|
|
future := b.raft.GetConfiguration()
|
|
if err := future.Error(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
ret := make([]Peer, len(future.Configuration().Servers))
|
|
for i, s := range future.Configuration().Servers {
|
|
ret[i] = Peer{
|
|
ID: string(s.ID),
|
|
Address: string(s.Address),
|
|
Suffrage: int(s.Suffrage),
|
|
}
|
|
}
|
|
|
|
return ret, nil
|
|
}
|
|
|
|
// SnapshotHTTP is a wrapper for Snapshot that sends the snapshot as an HTTP
|
|
// response.
|
|
func (b *RaftBackend) SnapshotHTTP(out *logical.HTTPResponseWriter, access *seal.Access) error {
|
|
out.Header().Add("Content-Disposition", "attachment")
|
|
out.Header().Add("Content-Type", "application/gzip")
|
|
|
|
return b.Snapshot(out, access)
|
|
}
|
|
|
|
// Snapshot takes a raft snapshot, packages it into a archive file and writes it
|
|
// to the provided writer. Seal access is used to encrypt the SHASUM file so we
|
|
// can validate the snapshot was taken using the same root keys or not.
|
|
func (b *RaftBackend) Snapshot(out io.Writer, access *seal.Access) error {
|
|
b.l.RLock()
|
|
defer b.l.RUnlock()
|
|
|
|
if b.raft == nil {
|
|
return errors.New("raft storage is sealed")
|
|
}
|
|
|
|
// If we have access to the seal create a sealer object
|
|
var s snapshot.Sealer
|
|
if access != nil {
|
|
s = &sealer{
|
|
access: access,
|
|
}
|
|
}
|
|
|
|
return snapshot.Write(b.logger.Named("snapshot"), b.raft, s, out)
|
|
}
|
|
|
|
// WriteSnapshotToTemp reads a snapshot archive off the provided reader,
|
|
// extracts the data and writes the snapshot to a temporary file. The seal
|
|
// access is used to decrypt the SHASUM file in the archive to ensure this
|
|
// snapshot has the same root key as the running instance. If the provided
|
|
// access is nil then it will skip that validation.
|
|
func (b *RaftBackend) WriteSnapshotToTemp(in io.ReadCloser, access *seal.Access) (*os.File, func(), raft.SnapshotMeta, error) {
|
|
b.l.RLock()
|
|
defer b.l.RUnlock()
|
|
|
|
var metadata raft.SnapshotMeta
|
|
if b.raft == nil {
|
|
return nil, nil, metadata, errors.New("raft storage is sealed")
|
|
}
|
|
|
|
// If we have access to the seal create a sealer object
|
|
var s snapshot.Sealer
|
|
if access != nil {
|
|
s = &sealer{
|
|
access: access,
|
|
}
|
|
}
|
|
|
|
snap, cleanup, err := snapshot.WriteToTempFileWithSealer(b.logger.Named("snapshot"), in, &metadata, s)
|
|
return snap, cleanup, metadata, err
|
|
}
|
|
|
|
// RestoreSnapshot applies the provided snapshot metadata and snapshot data to
|
|
// raft.
|
|
func (b *RaftBackend) RestoreSnapshot(ctx context.Context, metadata raft.SnapshotMeta, snap io.Reader) error {
|
|
if err := ctx.Err(); err != nil {
|
|
return err
|
|
}
|
|
|
|
b.l.RLock()
|
|
defer b.l.RUnlock()
|
|
|
|
if b.raft == nil {
|
|
return errors.New("raft storage is not initialized")
|
|
}
|
|
|
|
if err := b.raft.Restore(&metadata, snap, 0); err != nil {
|
|
b.logger.Named("snapshot").Error("failed to restore snapshot", "error", err)
|
|
return err
|
|
}
|
|
|
|
// Apply a log that tells the follower nodes to run the restore callback
|
|
// function. This is done after the restore call so we can be sure the
|
|
// snapshot applied to a quorum of nodes.
|
|
command := &LogData{
|
|
Operations: []*LogOperation{
|
|
{
|
|
OpType: restoreCallbackOp,
|
|
},
|
|
},
|
|
}
|
|
|
|
err := b.applyLog(ctx, command)
|
|
|
|
// Do a best-effort attempt to let the standbys apply the restoreCallbackOp
|
|
// before we continue.
|
|
time.Sleep(restoreOpDelayDuration)
|
|
return err
|
|
}
|
|
|
|
// Delete inserts an entry in the log to delete the given path
|
|
func (b *RaftBackend) Delete(ctx context.Context, path string) error {
|
|
defer metrics.MeasureSince([]string{"raft-storage", "delete"}, time.Now())
|
|
|
|
if err := ctx.Err(); err != nil {
|
|
return err
|
|
}
|
|
|
|
command := &LogData{
|
|
Operations: []*LogOperation{
|
|
{
|
|
OpType: deleteOp,
|
|
Key: path,
|
|
},
|
|
},
|
|
}
|
|
b.permitPool.Acquire()
|
|
defer b.permitPool.Release()
|
|
|
|
b.l.RLock()
|
|
err := b.applyLog(ctx, command)
|
|
b.l.RUnlock()
|
|
return err
|
|
}
|
|
|
|
// Get returns the value corresponding to the given path from the fsm
|
|
func (b *RaftBackend) Get(ctx context.Context, path string) (*physical.Entry, error) {
|
|
defer metrics.MeasureSince([]string{"raft-storage", "get"}, time.Now())
|
|
if b.fsm == nil {
|
|
return nil, errors.New("raft: fsm not configured")
|
|
}
|
|
|
|
if err := ctx.Err(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
b.permitPool.Acquire()
|
|
defer b.permitPool.Release()
|
|
|
|
if err := ctx.Err(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
entry, err := b.fsm.Get(ctx, path)
|
|
if entry != nil {
|
|
valueLen := len(entry.Value)
|
|
if uint64(valueLen) > b.maxEntrySize {
|
|
b.logger.Warn("retrieved entry value is too large, has raft's max_entry_size been reduced?",
|
|
"size", valueLen, "max_entry_size", b.maxEntrySize)
|
|
}
|
|
}
|
|
|
|
return entry, err
|
|
}
|
|
|
|
// Put inserts an entry in the log for the put operation. It will return an
|
|
// error if the resulting entry encoding exceeds the configured max_entry_size
|
|
// or if the call to applyLog fails.
|
|
func (b *RaftBackend) Put(ctx context.Context, entry *physical.Entry) error {
|
|
defer metrics.MeasureSince([]string{"raft-storage", "put"}, time.Now())
|
|
if len(entry.Key) > bolt.MaxKeySize {
|
|
return fmt.Errorf("%s, max key size for integrated storage is %d", physical.ErrKeyTooLarge, bolt.MaxKeySize)
|
|
}
|
|
|
|
if err := ctx.Err(); err != nil {
|
|
return err
|
|
}
|
|
|
|
command := &LogData{
|
|
Operations: []*LogOperation{
|
|
{
|
|
OpType: putOp,
|
|
Key: entry.Key,
|
|
Value: entry.Value,
|
|
},
|
|
},
|
|
}
|
|
|
|
b.permitPool.Acquire()
|
|
defer b.permitPool.Release()
|
|
|
|
b.l.RLock()
|
|
err := b.applyLog(ctx, command)
|
|
b.l.RUnlock()
|
|
return err
|
|
}
|
|
|
|
// List enumerates all the items under the prefix from the fsm
|
|
func (b *RaftBackend) List(ctx context.Context, prefix string) ([]string, error) {
|
|
defer metrics.MeasureSince([]string{"raft-storage", "list"}, time.Now())
|
|
if b.fsm == nil {
|
|
return nil, errors.New("raft: fsm not configured")
|
|
}
|
|
|
|
if err := ctx.Err(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
b.permitPool.Acquire()
|
|
defer b.permitPool.Release()
|
|
|
|
if err := ctx.Err(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return b.fsm.List(ctx, prefix)
|
|
}
|
|
|
|
// Transaction applies all the given operations into a single log and
|
|
// applies it.
|
|
func (b *RaftBackend) Transaction(ctx context.Context, txns []*physical.TxnEntry) error {
|
|
defer metrics.MeasureSince([]string{"raft-storage", "transaction"}, time.Now())
|
|
|
|
if err := ctx.Err(); err != nil {
|
|
return err
|
|
}
|
|
|
|
command := &LogData{
|
|
Operations: make([]*LogOperation, len(txns)),
|
|
}
|
|
for i, txn := range txns {
|
|
op := &LogOperation{}
|
|
switch txn.Operation {
|
|
case physical.PutOperation:
|
|
if len(txn.Entry.Key) > bolt.MaxKeySize {
|
|
return fmt.Errorf("%s, max key size for integrated storage is %d", physical.ErrKeyTooLarge, bolt.MaxKeySize)
|
|
}
|
|
op.OpType = putOp
|
|
op.Key = txn.Entry.Key
|
|
op.Value = txn.Entry.Value
|
|
case physical.DeleteOperation:
|
|
op.OpType = deleteOp
|
|
op.Key = txn.Entry.Key
|
|
default:
|
|
return fmt.Errorf("%q is not a supported transaction operation", txn.Operation)
|
|
}
|
|
|
|
command.Operations[i] = op
|
|
}
|
|
|
|
b.permitPool.Acquire()
|
|
defer b.permitPool.Release()
|
|
|
|
b.l.RLock()
|
|
err := b.applyLog(ctx, command)
|
|
b.l.RUnlock()
|
|
return err
|
|
}
|
|
|
|
// applyLog will take a given log command and apply it to the raft log. applyLog
|
|
// doesn't return until the log has been applied to a quorum of servers and is
|
|
// persisted to the local FSM. Caller should hold the backend's read lock.
|
|
func (b *RaftBackend) applyLog(ctx context.Context, command *LogData) error {
|
|
if b.raft == nil {
|
|
return errors.New("raft storage is not initialized")
|
|
}
|
|
if err := ctx.Err(); err != nil {
|
|
return err
|
|
}
|
|
|
|
commandBytes, err := proto.Marshal(command)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
cmdSize := len(commandBytes)
|
|
if uint64(cmdSize) > b.maxEntrySize {
|
|
return fmt.Errorf("%s; got %d bytes, max: %d bytes", physical.ErrValueTooLarge, cmdSize, b.maxEntrySize)
|
|
}
|
|
|
|
defer metrics.AddSample([]string{"raft-storage", "entry_size"}, float32(cmdSize))
|
|
|
|
var chunked bool
|
|
var applyFuture raft.ApplyFuture
|
|
switch {
|
|
case len(commandBytes) <= raftchunking.ChunkSize:
|
|
applyFuture = b.raft.Apply(commandBytes, 0)
|
|
default:
|
|
chunked = true
|
|
applyFuture = raftchunking.ChunkingApply(commandBytes, nil, 0, b.raft.ApplyLog)
|
|
}
|
|
|
|
if err := applyFuture.Error(); err != nil {
|
|
return err
|
|
}
|
|
|
|
resp := applyFuture.Response()
|
|
|
|
if chunked {
|
|
// In this case we didn't apply all chunks successfully, possibly due
|
|
// to a term change
|
|
if resp == nil {
|
|
// This returns the error in the interface because the raft library
|
|
// returns errors from the FSM via the future, not via err from the
|
|
// apply function. Downstream client code expects to see any error
|
|
// from the FSM (as opposed to the apply itself) and decide whether
|
|
// it can retry in the future's response.
|
|
return errors.New("applying chunking failed, please retry")
|
|
}
|
|
|
|
// We expect that this conversion should always work
|
|
chunkedSuccess, ok := resp.(raftchunking.ChunkingSuccess)
|
|
if !ok {
|
|
return errors.New("unknown type of response back from chunking FSM")
|
|
}
|
|
|
|
// Replace the reply with the inner wrapped version
|
|
resp = chunkedSuccess.Response
|
|
}
|
|
|
|
if resp, ok := resp.(*FSMApplyResponse); !ok || !resp.Success {
|
|
return errors.New("could not apply data")
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// HAEnabled is the implementation of the HABackend interface
|
|
func (b *RaftBackend) HAEnabled() bool { return true }
|
|
|
|
// HAEnabled is the implementation of the HABackend interface
|
|
func (b *RaftBackend) LockWith(key, value string) (physical.Lock, error) {
|
|
return &RaftLock{
|
|
key: key,
|
|
value: []byte(value),
|
|
b: b,
|
|
}, nil
|
|
}
|
|
|
|
// SetDesiredSuffrage sets a field in the fsm indicating the suffrage intent for
|
|
// this node.
|
|
func (b *RaftBackend) SetDesiredSuffrage(nonVoter bool) error {
|
|
b.l.Lock()
|
|
defer b.l.Unlock()
|
|
|
|
var desiredSuffrage string
|
|
switch nonVoter {
|
|
case true:
|
|
desiredSuffrage = "non-voter"
|
|
default:
|
|
desiredSuffrage = "voter"
|
|
}
|
|
|
|
err := b.fsm.recordSuffrage(desiredSuffrage)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (b *RaftBackend) DesiredSuffrage() string {
|
|
return b.fsm.DesiredSuffrage()
|
|
}
|
|
|
|
// RaftLock implements the physical Lock interface and enables HA for this
|
|
// backend. The Lock uses the raftNotifyCh for receiving leadership edge
|
|
// triggers. Vault's active duty matches raft's leadership.
|
|
type RaftLock struct {
|
|
key string
|
|
value []byte
|
|
|
|
b *RaftBackend
|
|
}
|
|
|
|
// monitorLeadership waits until we receive an update on the raftNotifyCh and
|
|
// closes the leaderLost channel.
|
|
func (l *RaftLock) monitorLeadership(stopCh <-chan struct{}, leaderNotifyCh <-chan bool) <-chan struct{} {
|
|
leaderLost := make(chan struct{})
|
|
go func() {
|
|
for {
|
|
select {
|
|
case isLeader := <-leaderNotifyCh:
|
|
// leaderNotifyCh may deliver a true value initially if this
|
|
// server is already the leader prior to RaftLock.Lock call
|
|
// (the true message was already queued). The next message is
|
|
// always going to be false. The for loop should loop at most
|
|
// twice.
|
|
if !isLeader {
|
|
close(leaderLost)
|
|
return
|
|
}
|
|
case <-stopCh:
|
|
return
|
|
}
|
|
}
|
|
}()
|
|
return leaderLost
|
|
}
|
|
|
|
// Lock blocks until we become leader or are shutdown. It returns a channel that
|
|
// is closed when we detect a loss of leadership.
|
|
func (l *RaftLock) Lock(stopCh <-chan struct{}) (<-chan struct{}, error) {
|
|
// If not initialized, block until it is
|
|
if !l.b.Initialized() {
|
|
select {
|
|
case <-l.b.raftInitCh:
|
|
case <-stopCh:
|
|
return nil, nil
|
|
}
|
|
}
|
|
|
|
l.b.l.RLock()
|
|
|
|
// Ensure that we still have a raft instance after grabbing the read lock
|
|
if l.b.raft == nil {
|
|
l.b.l.RUnlock()
|
|
return nil, errors.New("attempted to grab a lock on a nil raft backend")
|
|
}
|
|
|
|
// Cache the notifyCh locally
|
|
leaderNotifyCh := l.b.raftNotifyCh
|
|
|
|
// Check to see if we are already leader.
|
|
if l.b.raft.State() == raft.Leader {
|
|
err := l.b.applyLog(context.Background(), &LogData{
|
|
Operations: []*LogOperation{
|
|
{
|
|
OpType: putOp,
|
|
Key: l.key,
|
|
Value: l.value,
|
|
},
|
|
},
|
|
})
|
|
l.b.l.RUnlock()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return l.monitorLeadership(stopCh, leaderNotifyCh), nil
|
|
}
|
|
l.b.l.RUnlock()
|
|
|
|
for {
|
|
select {
|
|
case isLeader := <-leaderNotifyCh:
|
|
if isLeader {
|
|
// We are leader, set the key
|
|
l.b.l.RLock()
|
|
err := l.b.applyLog(context.Background(), &LogData{
|
|
Operations: []*LogOperation{
|
|
{
|
|
OpType: putOp,
|
|
Key: l.key,
|
|
Value: l.value,
|
|
},
|
|
},
|
|
})
|
|
l.b.l.RUnlock()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return l.monitorLeadership(stopCh, leaderNotifyCh), nil
|
|
}
|
|
case <-stopCh:
|
|
return nil, nil
|
|
}
|
|
}
|
|
}
|
|
|
|
// Unlock gives up leadership.
|
|
func (l *RaftLock) Unlock() error {
|
|
if l.b.raft == nil {
|
|
return nil
|
|
}
|
|
|
|
return l.b.raft.LeadershipTransfer().Error()
|
|
}
|
|
|
|
// Value reads the value of the lock. This informs us who is currently leader.
|
|
func (l *RaftLock) Value() (bool, string, error) {
|
|
e, err := l.b.Get(context.Background(), l.key)
|
|
if err != nil {
|
|
return false, "", err
|
|
}
|
|
if e == nil {
|
|
return false, "", nil
|
|
}
|
|
|
|
value := string(e.Value)
|
|
// TODO: how to tell if held?
|
|
return true, value, nil
|
|
}
|
|
|
|
// sealer implements the snapshot.Sealer interface and is used in the snapshot
|
|
// process for encrypting/decrypting the SHASUM file in snapshot archives.
|
|
type sealer struct {
|
|
access *seal.Access
|
|
}
|
|
|
|
// Seal encrypts the data with using the seal access object.
|
|
func (s sealer) Seal(ctx context.Context, pt []byte) ([]byte, error) {
|
|
if s.access == nil {
|
|
return nil, errors.New("no seal access available")
|
|
}
|
|
eblob, err := s.access.Encrypt(ctx, pt, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return proto.Marshal(eblob)
|
|
}
|
|
|
|
// Open decrypts the data using the seal access object.
|
|
func (s sealer) Open(ctx context.Context, ct []byte) ([]byte, error) {
|
|
if s.access == nil {
|
|
return nil, errors.New("no seal access available")
|
|
}
|
|
|
|
var eblob wrapping.EncryptedBlobInfo
|
|
err := proto.Unmarshal(ct, &eblob)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return s.access.Decrypt(ctx, &eblob, nil)
|
|
}
|
|
|
|
// boltOptions returns a bolt.Options struct, suitable for passing to
|
|
// bolt.Open(), pre-configured with all of our preferred defaults.
|
|
func boltOptions(path string) *bolt.Options {
|
|
o := &bolt.Options{
|
|
Timeout: 1 * time.Second,
|
|
FreelistType: bolt.FreelistMapType,
|
|
NoFreelistSync: true,
|
|
MmapFlags: getMmapFlags(path),
|
|
}
|
|
|
|
if os.Getenv("VAULT_RAFT_FREELIST_TYPE") == "array" {
|
|
o.FreelistType = bolt.FreelistArrayType
|
|
}
|
|
|
|
if os.Getenv("VAULT_RAFT_FREELIST_SYNC") != "" {
|
|
o.NoFreelistSync = false
|
|
}
|
|
|
|
// By default, we want to set InitialMmapSize to 100GB, but only on 64bit platforms.
|
|
// Otherwise, we set it to whatever the value of VAULT_RAFT_INITIAL_MMAP_SIZE
|
|
// is, assuming it can be parsed as an int. Bolt itself sets this to 0 by default,
|
|
// so if users are wanting to turn this off, they can also set it to 0. Setting it
|
|
// to a negative value is the same as not setting it at all.
|
|
if os.Getenv("VAULT_RAFT_INITIAL_MMAP_SIZE") == "" {
|
|
o.InitialMmapSize = initialMmapSize
|
|
} else {
|
|
imms, err := strconv.Atoi(os.Getenv("VAULT_RAFT_INITIAL_MMAP_SIZE"))
|
|
|
|
// If there's an error here, it means they passed something that's not convertible to
|
|
// a number. Rather than fail startup, just ignore it.
|
|
if err == nil && imms > 0 {
|
|
o.InitialMmapSize = imms
|
|
}
|
|
}
|
|
|
|
return o
|
|
}
|