open-vault/physical/raft/raft.go
Vishal Nayak 8891f2ba88 Raft retry join (#7856)
* Raft retry join

* update

* Make retry join work with shamir seal

* Return upon context completion

* Update vault/raft.go

Co-Authored-By: Brian Kassouf <briankassouf@users.noreply.github.com>

* Address some review comments

* send leader information slice as a parameter

* Make retry join work properly with Shamir case. This commit has a blocking issue

* Fix join goroutine exiting before the job is done

* Polishing changes

* Don't return after a successful join during unseal

* Added config parsing test

* Add test and fix bugs

* minor changes

* Address review comments

* Fix build error

Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
2020-01-13 17:02:16 -08:00

1131 lines
31 KiB
Go

package raft
import (
"context"
"crypto/tls"
"errors"
"fmt"
"github.com/hashicorp/vault/sdk/helper/jsonutil"
"github.com/hashicorp/vault/sdk/helper/tlsutil"
"io"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"sync"
"time"
"github.com/armon/go-metrics"
"github.com/golang/protobuf/proto"
"github.com/hashicorp/errwrap"
log "github.com/hashicorp/go-hclog"
wrapping "github.com/hashicorp/go-kms-wrapping"
"github.com/hashicorp/go-raftchunking"
"github.com/hashicorp/go-uuid"
"github.com/hashicorp/raft"
snapshot "github.com/hashicorp/raft-snapshot"
raftboltdb "github.com/hashicorp/vault/physical/raft/logstore"
"github.com/hashicorp/vault/sdk/helper/consts"
"github.com/hashicorp/vault/sdk/logical"
"github.com/hashicorp/vault/vault/cluster"
"github.com/hashicorp/vault/vault/seal"
"github.com/hashicorp/vault/sdk/physical"
)
// EnvVaultRaftNodeID is used to fetch the Raft node ID from the environment.
const EnvVaultRaftNodeID = "VAULT_RAFT_NODE_ID"
// EnvVaultRaftPath is used to fetch the path where Raft data is stored from the environment.
const EnvVaultRaftPath = "VAULT_RAFT_PATH"
// Verify RaftBackend satisfies the correct interfaces
var _ physical.Backend = (*RaftBackend)(nil)
var _ physical.Transactional = (*RaftBackend)(nil)
var (
// raftLogCacheSize is the maximum number of logs to cache in-memory.
// This is used to reduce disk I/O for the recently committed entries.
raftLogCacheSize = 512
raftState = "raft/"
peersFileName = "peers.json"
snapshotsRetained = 2
restoreOpDelayDuration = 5 * time.Second
)
// RaftBackend implements the backend interfaces and uses the raft protocol to
// persist writes to the FSM.
type RaftBackend struct {
logger log.Logger
conf map[string]string
l sync.RWMutex
// fsm is the state store for vault's data
fsm *FSM
// raft is the instance of raft we will operate on.
raft *raft.Raft
// raftNotifyCh is used to receive updates about leadership changes
// regarding this node.
raftNotifyCh chan bool
// streamLayer is the network layer used to connect the nodes in the raft
// cluster.
streamLayer *raftLayer
// raftTransport is the transport layer that the raft library uses for RPC
// communication.
raftTransport raft.Transport
// snapStore is our snapshot mechanism.
snapStore raft.SnapshotStore
// logStore is used by the raft library to store the raft logs in durable
// storage.
logStore raft.LogStore
// stableStore is used by the raft library to store additional metadata in
// durable storage.
stableStore raft.StableStore
// bootstrapConfig is only set when this node needs to be bootstrapped upon
// startup.
bootstrapConfig *raft.Configuration
// dataDir is the location on the local filesystem that raft and FSM data
// will be stored.
dataDir string
// localID is the ID for this node. This can either be configured in the
// config file, via a file on disk, or is otherwise randomly generated.
localID string
// serverAddressProvider is used to map server IDs to addresses.
serverAddressProvider raft.ServerAddressProvider
// permitPool is used to limit the number of concurrent storage calls.
permitPool *physical.PermitPool
}
// LeaderJoinInfo contains information required by a node to join itself as a
// follower to an existing raft cluster
type LeaderJoinInfo struct {
// LeaderAPIAddr is the address of the leader node to connect to
LeaderAPIAddr string `json:"leader_api_addr"`
// LeaderCACert is the CA cert of the leader node
LeaderCACert string `json:"leader_ca_cert"`
// LeaderClientCert is the client certificate for the follower node to establish
// client authentication during TLS
LeaderClientCert string `json:"leader_client_cert"`
// LeaderClientKey is the client key for the follower node to establish client
// authentication during TLS
LeaderClientKey string `json:"leader_client_key"`
// Retry indicates if the join process should automatically be retried
Retry bool `json:"-"`
// TLSConfig for the API client to use when communicating with the leader node
TLSConfig *tls.Config `json:"-"`
}
// JoinConfig returns a list of information about possible leader nodes that
// this node can join as a follower
func (b *RaftBackend) JoinConfig() ([]*LeaderJoinInfo, error) {
config := b.conf["retry_join"]
if config == "" {
return nil, nil
}
var leaderInfos []*LeaderJoinInfo
err := jsonutil.DecodeJSON([]byte(config), &leaderInfos)
if err != nil {
return nil, errwrap.Wrapf("failed to decode retry_join config: {{err}}", err)
}
if len(leaderInfos) == 0 {
return nil, errors.New("invalid retry_join config")
}
for _, info := range leaderInfos {
info.Retry = true
var tlsConfig *tls.Config
var err error
if len(info.LeaderCACert) != 0 || len(info.LeaderClientCert) != 0 || len(info.LeaderClientKey) != 0 {
tlsConfig, err = tlsutil.ClientTLSConfig([]byte(info.LeaderCACert), []byte(info.LeaderClientCert), []byte(info.LeaderClientKey))
if err != nil {
return nil, errwrap.Wrapf(fmt.Sprintf("failed to create tls config to communicate with leader node %q: {{err}}", info.LeaderAPIAddr), err)
}
}
info.TLSConfig = tlsConfig
}
return leaderInfos, nil
}
// EnsurePath is used to make sure a path exists
func EnsurePath(path string, dir bool) error {
if !dir {
path = filepath.Dir(path)
}
return os.MkdirAll(path, 0755)
}
// NewRaftBackend constructs a RaftBackend using the given directory
func NewRaftBackend(conf map[string]string, logger log.Logger) (physical.Backend, error) {
// Create the FSM.
fsm, err := NewFSM(conf, logger.Named("fsm"))
if err != nil {
return nil, fmt.Errorf("failed to create fsm: %v", err)
}
path := os.Getenv(EnvVaultRaftPath)
if path == "" {
pathFromConfig, ok := conf["path"]
if !ok {
return nil, fmt.Errorf("'path' must be set")
}
path = pathFromConfig
}
// Build an all in-memory setup for dev mode, otherwise prepare a full
// disk-based setup.
var log raft.LogStore
var stable raft.StableStore
var snap raft.SnapshotStore
var devMode bool
if devMode {
store := raft.NewInmemStore()
stable = store
log = store
snap = raft.NewInmemSnapshotStore()
} else {
// Create the base raft path.
path := filepath.Join(path, raftState)
if err := EnsurePath(path, true); err != nil {
return nil, err
}
// Create the backend raft store for logs and stable storage.
store, err := raftboltdb.NewBoltStore(filepath.Join(path, "raft.db"))
if err != nil {
return nil, err
}
stable = store
// Wrap the store in a LogCache to improve performance.
cacheStore, err := raft.NewLogCache(raftLogCacheSize, store)
if err != nil {
return nil, err
}
log = cacheStore
// Create the snapshot store.
snapshots, err := NewBoltSnapshotStore(path, snapshotsRetained, logger.Named("snapshot"), fsm)
if err != nil {
return nil, err
}
snap = snapshots
}
var localID string
{
// Determine the local node ID from the environment.
if raftNodeID := os.Getenv(EnvVaultRaftNodeID); raftNodeID != "" {
localID = raftNodeID
}
// If not set in the environment check the configuration file.
if len(localID) == 0 {
localID = conf["node_id"]
}
// If not set in the config check the "node-id" file.
if len(localID) == 0 {
localIDRaw, err := ioutil.ReadFile(filepath.Join(path, "node-id"))
switch {
case err == nil:
if len(localIDRaw) > 0 {
localID = string(localIDRaw)
}
case os.IsNotExist(err):
default:
return nil, err
}
}
// If all of the above fails generate a UUID and persist it to the
// "node-id" file.
if len(localID) == 0 {
id, err := uuid.GenerateUUID()
if err != nil {
return nil, err
}
if err := ioutil.WriteFile(filepath.Join(path, "node-id"), []byte(id), 0600); err != nil {
return nil, err
}
localID = id
}
}
return &RaftBackend{
logger: logger,
fsm: fsm,
conf: conf,
logStore: log,
stableStore: stable,
snapStore: snap,
dataDir: path,
localID: localID,
permitPool: physical.NewPermitPool(physical.DefaultParallelOperations),
}, nil
}
// RaftServer has information about a server in the Raft configuration
type RaftServer struct {
// NodeID is the name of the server
NodeID string `json:"node_id"`
// Address is the IP:port of the server, used for Raft communications
Address string `json:"address"`
// Leader is true if this server is the current cluster leader
Leader bool `json:"leader"`
// Protocol version is the raft protocol version used by the server
ProtocolVersion string `json:"protocol_version"`
// Voter is true if this server has a vote in the cluster. This might
// be false if the server is staging and still coming online.
Voter bool `json:"voter"`
}
// RaftConfigurationResponse is returned when querying for the current Raft
// configuration.
type RaftConfigurationResponse struct {
// Servers has the list of servers in the Raft configuration.
Servers []*RaftServer `json:"servers"`
// Index has the Raft index of this configuration.
Index uint64 `json:"index"`
}
// Peer defines the ID and Address for a given member of the raft cluster.
type Peer struct {
ID string `json:"id"`
Address string `json:"address"`
}
// NodeID returns the identifier of the node
func (b *RaftBackend) NodeID() string {
return b.localID
}
// Initialized tells if raft is running or not
func (b *RaftBackend) Initialized() bool {
b.l.RLock()
init := b.raft != nil
b.l.RUnlock()
return init
}
// SetTLSKeyring is used to install a new keyring. If the active key has changed
// it will also close any network connections or streams forcing a reconnect
// with the new key.
func (b *RaftBackend) SetTLSKeyring(keyring *TLSKeyring) error {
b.l.RLock()
err := b.streamLayer.setTLSKeyring(keyring)
b.l.RUnlock()
return err
}
// SetServerAddressProvider sets a the address provider for determining the raft
// node addresses. This is currently only used in tests.
func (b *RaftBackend) SetServerAddressProvider(provider raft.ServerAddressProvider) {
b.l.Lock()
b.serverAddressProvider = provider
b.l.Unlock()
}
// Bootstrap prepares the given peers to be part of the raft cluster
func (b *RaftBackend) Bootstrap(ctx context.Context, peers []Peer) error {
b.l.Lock()
defer b.l.Unlock()
hasState, err := raft.HasExistingState(b.logStore, b.stableStore, b.snapStore)
if err != nil {
return err
}
if hasState {
return errors.New("error bootstrapping cluster: cluster already has state")
}
raftConfig := &raft.Configuration{
Servers: make([]raft.Server, len(peers)),
}
for i, p := range peers {
raftConfig.Servers[i] = raft.Server{
ID: raft.ServerID(p.ID),
Address: raft.ServerAddress(p.Address),
}
}
// Store the config for later use
b.bootstrapConfig = raftConfig
return nil
}
// SetRestoreCallback sets the callback to be used when a restoreCallbackOp is
// processed through the FSM.
func (b *RaftBackend) SetRestoreCallback(restoreCb restoreCallback) {
b.fsm.l.Lock()
b.fsm.restoreCb = restoreCb
b.fsm.l.Unlock()
}
func (b *RaftBackend) applyConfigSettings(config *raft.Config) error {
config.Logger = b.logger
multiplierRaw, ok := b.conf["performance_multiplier"]
multiplier := 5
if ok {
var err error
multiplier, err = strconv.Atoi(multiplierRaw)
if err != nil {
return err
}
}
config.ElectionTimeout = config.ElectionTimeout * time.Duration(multiplier)
config.HeartbeatTimeout = config.HeartbeatTimeout * time.Duration(multiplier)
config.LeaderLeaseTimeout = config.LeaderLeaseTimeout * time.Duration(multiplier)
snapThresholdRaw, ok := b.conf["snapshot_threshold"]
if ok {
var err error
snapThreshold, err := strconv.Atoi(snapThresholdRaw)
if err != nil {
return err
}
config.SnapshotThreshold = uint64(snapThreshold)
}
trailingLogsRaw, ok := b.conf["trailing_logs"]
if ok {
var err error
trailingLogs, err := strconv.Atoi(trailingLogsRaw)
if err != nil {
return err
}
config.TrailingLogs = uint64(trailingLogs)
}
config.NoSnapshotRestoreOnStart = true
config.MaxAppendEntries = 64
return nil
}
// SetupOpts are used to pass options to the raft setup function.
type SetupOpts struct {
// TLSKeyring is the keyring to use for the cluster traffic.
TLSKeyring *TLSKeyring
// ClusterListener is the cluster hook used to register the raft handler and
// client with core's cluster listeners.
ClusterListener cluster.ClusterHook
// StartAsLeader is used to specify this node should start as leader and
// bypass the leader election. This should be used with caution.
StartAsLeader bool
// RecoveryModeConfig is the configuration for the raft cluster in recovery
// mode.
RecoveryModeConfig *raft.Configuration
}
func (b *RaftBackend) StartRecoveryCluster(ctx context.Context, peer Peer) error {
recoveryModeConfig := &raft.Configuration{
Servers: []raft.Server{
{
ID: raft.ServerID(peer.ID),
Address: raft.ServerAddress(peer.Address),
},
},
}
return b.SetupCluster(context.Background(), SetupOpts{
StartAsLeader: true,
RecoveryModeConfig: recoveryModeConfig,
})
}
// SetupCluster starts the raft cluster and enables the networking needed for
// the raft nodes to communicate.
func (b *RaftBackend) SetupCluster(ctx context.Context, opts SetupOpts) error {
b.logger.Trace("setting up raft cluster")
b.l.Lock()
defer b.l.Unlock()
// We are already unsealed
if b.raft != nil {
b.logger.Debug("raft already started, not setting up cluster")
return nil
}
if len(b.localID) == 0 {
return errors.New("no local node id configured")
}
// Setup the raft config
raftConfig := raft.DefaultConfig()
if err := b.applyConfigSettings(raftConfig); err != nil {
return err
}
switch {
case opts.TLSKeyring == nil && opts.ClusterListener == nil:
// If we don't have a provided network we use an in-memory one.
// This allows us to bootstrap a node without bringing up a cluster
// network. This will be true during bootstrap, tests and dev modes.
_, b.raftTransport = raft.NewInmemTransportWithTimeout(raft.ServerAddress(b.localID), time.Second)
case opts.TLSKeyring == nil:
return errors.New("no keyring provided")
case opts.ClusterListener == nil:
return errors.New("no cluster listener provided")
default:
// Load the base TLS config from the cluster listener.
baseTLSConfig, err := opts.ClusterListener.TLSConfig(ctx)
if err != nil {
return err
}
// Set the local address and localID in the streaming layer and the raft config.
streamLayer, err := NewRaftLayer(b.logger.Named("stream"), opts.TLSKeyring, opts.ClusterListener.Addr(), baseTLSConfig)
if err != nil {
return err
}
transConfig := &raft.NetworkTransportConfig{
Stream: streamLayer,
MaxPool: 3,
Timeout: 10 * time.Second,
ServerAddressProvider: b.serverAddressProvider,
}
transport := raft.NewNetworkTransportWithConfig(transConfig)
b.streamLayer = streamLayer
b.raftTransport = transport
}
raftConfig.LocalID = raft.ServerID(b.localID)
// Set up a channel for reliable leader notifications.
raftNotifyCh := make(chan bool, 1)
raftConfig.NotifyCh = raftNotifyCh
// If we have a bootstrapConfig set we should bootstrap now.
if b.bootstrapConfig != nil {
bootstrapConfig := b.bootstrapConfig
// Unset the bootstrap config
b.bootstrapConfig = nil
// Bootstrap raft with our known cluster members.
if err := raft.BootstrapCluster(raftConfig, b.logStore, b.stableStore, b.snapStore, b.raftTransport, *bootstrapConfig); err != nil {
return err
}
// If we are the only node we should start as the leader.
if len(bootstrapConfig.Servers) == 1 {
opts.StartAsLeader = true
}
}
raftConfig.StartAsLeader = opts.StartAsLeader
// Setup the Raft store.
b.fsm.SetNoopRestore(true)
raftPath := filepath.Join(b.dataDir, raftState)
peersFile := filepath.Join(raftPath, peersFileName)
_, err := os.Stat(peersFile)
if err == nil {
b.logger.Info("raft recovery initiated", "recovery_file", peersFileName)
recoveryConfig, err := raft.ReadConfigJSON(peersFile)
if err != nil {
return errwrap.Wrapf("raft recovery failed to parse peers.json: {{err}}", err)
}
b.logger.Info("raft recovery: found new config", "config", recoveryConfig)
err = raft.RecoverCluster(raftConfig, b.fsm, b.logStore, b.stableStore, b.snapStore, b.raftTransport, recoveryConfig)
if err != nil {
return errwrap.Wrapf("raft recovery failed: {{err}}", err)
}
err = os.Remove(peersFile)
if err != nil {
return errwrap.Wrapf("raft recovery failed to delete peers.json; please delete manually: {{err}}", err)
}
b.logger.Info("raft recovery deleted peers.json")
}
if opts.RecoveryModeConfig != nil {
err = raft.RecoverCluster(raftConfig, b.fsm, b.logStore, b.stableStore, b.snapStore, b.raftTransport, *opts.RecoveryModeConfig)
if err != nil {
return errwrap.Wrapf("recovering raft cluster failed: {{err}}", err)
}
}
raftObj, err := raft.NewRaft(raftConfig, b.fsm.chunker, b.logStore, b.stableStore, b.snapStore, b.raftTransport)
b.fsm.SetNoopRestore(false)
if err != nil {
return err
}
b.raft = raftObj
b.raftNotifyCh = raftNotifyCh
if b.streamLayer != nil {
// Add Handler to the cluster.
opts.ClusterListener.AddHandler(consts.RaftStorageALPN, b.streamLayer)
// Add Client to the cluster.
opts.ClusterListener.AddClient(consts.RaftStorageALPN, b.streamLayer)
}
return nil
}
// TeardownCluster shuts down the raft cluster
func (b *RaftBackend) TeardownCluster(clusterListener cluster.ClusterHook) error {
if clusterListener != nil {
clusterListener.StopHandler(consts.RaftStorageALPN)
clusterListener.RemoveClient(consts.RaftStorageALPN)
}
b.l.Lock()
future := b.raft.Shutdown()
b.raft = nil
b.l.Unlock()
return future.Error()
}
// AppliedIndex returns the latest index applied to the FSM
func (b *RaftBackend) AppliedIndex() uint64 {
b.l.RLock()
defer b.l.RUnlock()
if b.raft == nil {
return 0
}
return b.raft.AppliedIndex()
}
// RemovePeer removes the given peer ID from the raft cluster. If the node is
// ourselves we will give up leadership.
func (b *RaftBackend) RemovePeer(ctx context.Context, peerID string) error {
b.l.RLock()
defer b.l.RUnlock()
if b.raft == nil {
return errors.New("raft storage is not initialized")
}
future := b.raft.RemoveServer(raft.ServerID(peerID), 0, 0)
return future.Error()
}
func (b *RaftBackend) GetConfiguration(ctx context.Context) (*RaftConfigurationResponse, error) {
b.l.RLock()
defer b.l.RUnlock()
if b.raft == nil {
return nil, errors.New("raft storage is not initialized")
}
future := b.raft.GetConfiguration()
if err := future.Error(); err != nil {
return nil, err
}
config := &RaftConfigurationResponse{
Index: future.Index(),
}
for _, server := range future.Configuration().Servers {
entry := &RaftServer{
NodeID: string(server.ID),
Address: string(server.Address),
// Since we only service this request on the active node our node ID
// denotes the raft leader.
Leader: string(server.ID) == b.NodeID(),
Voter: server.Suffrage == raft.Voter,
ProtocolVersion: strconv.Itoa(raft.ProtocolVersionMax),
}
config.Servers = append(config.Servers, entry)
}
return config, nil
}
// AddPeer adds a new server to the raft cluster
func (b *RaftBackend) AddPeer(ctx context.Context, peerID, clusterAddr string) error {
b.l.RLock()
defer b.l.RUnlock()
if b.raft == nil {
return errors.New("raft storage is not initialized")
}
b.logger.Debug("adding raft peer", "node_id", peerID, "cluster_addr", clusterAddr)
future := b.raft.AddVoter(raft.ServerID(peerID), raft.ServerAddress(clusterAddr), 0, 0)
return future.Error()
}
// Peers returns all the servers present in the raft cluster
func (b *RaftBackend) Peers(ctx context.Context) ([]Peer, error) {
b.l.RLock()
defer b.l.RUnlock()
if b.raft == nil {
return nil, errors.New("raft storage backend is not initialized")
}
future := b.raft.GetConfiguration()
if err := future.Error(); err != nil {
return nil, err
}
ret := make([]Peer, len(future.Configuration().Servers))
for i, s := range future.Configuration().Servers {
ret[i] = Peer{
ID: string(s.ID),
Address: string(s.Address),
}
}
return ret, nil
}
// Snapshot takes a raft snapshot, packages it into a archive file and writes it
// to the provided writer. Seal access is used to encrypt the SHASUM file so we
// can validate the snapshot was taken using the same master keys or not.
func (b *RaftBackend) Snapshot(out *logical.HTTPResponseWriter, access *seal.Access) error {
b.l.RLock()
defer b.l.RUnlock()
if b.raft == nil {
return errors.New("raft storage backend is sealed")
}
// If we have access to the seal create a sealer object
var s snapshot.Sealer
if access != nil {
s = &sealer{
access: access,
}
}
snap, err := snapshot.NewWithSealer(b.logger.Named("snapshot"), b.raft, s)
if err != nil {
return err
}
defer snap.Close()
size, err := snap.Size()
if err != nil {
return err
}
out.Header().Add("Content-Disposition", "attachment")
out.Header().Add("Content-Length", fmt.Sprintf("%d", size))
out.Header().Add("Content-Type", "application/gzip")
_, err = io.Copy(out, snap)
if err != nil {
return err
}
return nil
}
// WriteSnapshotToTemp reads a snapshot archive off the provided reader,
// extracts the data and writes the snapshot to a temporary file. The seal
// access is used to decrypt the SHASUM file in the archive to ensure this
// snapshot has the same master key as the running instance. If the provided
// access is nil then it will skip that validation.
func (b *RaftBackend) WriteSnapshotToTemp(in io.ReadCloser, access *seal.Access) (*os.File, func(), raft.SnapshotMeta, error) {
b.l.RLock()
defer b.l.RUnlock()
var metadata raft.SnapshotMeta
if b.raft == nil {
return nil, nil, metadata, errors.New("raft storage backend is sealed")
}
// If we have access to the seal create a sealer object
var s snapshot.Sealer
if access != nil {
s = &sealer{
access: access,
}
}
snap, cleanup, err := snapshot.WriteToTempFileWithSealer(b.logger.Named("snapshot"), in, &metadata, s)
return snap, cleanup, metadata, err
}
// RestoreSnapshot applies the provided snapshot metadata and snapshot data to
// raft.
func (b *RaftBackend) RestoreSnapshot(ctx context.Context, metadata raft.SnapshotMeta, snap io.Reader) error {
b.l.RLock()
defer b.l.RUnlock()
if b.raft == nil {
return errors.New("raft storage is not initialized")
}
if err := b.raft.Restore(&metadata, snap, 0); err != nil {
b.logger.Named("snapshot").Error("failed to restore snapshot", "error", err)
return err
}
// Apply a log that tells the follower nodes to run the restore callback
// function. This is done after the restore call so we can be sure the
// snapshot applied to a quorum of nodes.
command := &LogData{
Operations: []*LogOperation{
&LogOperation{
OpType: restoreCallbackOp,
},
},
}
b.l.RLock()
err := b.applyLog(ctx, command)
b.l.RUnlock()
// Do a best-effort attempt to let the standbys apply the restoreCallbackOp
// before we continue.
time.Sleep(restoreOpDelayDuration)
return err
}
// Delete inserts an entry in the log to delete the given path
func (b *RaftBackend) Delete(ctx context.Context, path string) error {
defer metrics.MeasureSince([]string{"raft-storage", "delete"}, time.Now())
command := &LogData{
Operations: []*LogOperation{
&LogOperation{
OpType: deleteOp,
Key: path,
},
},
}
b.permitPool.Acquire()
defer b.permitPool.Release()
b.l.RLock()
err := b.applyLog(ctx, command)
b.l.RUnlock()
return err
}
// Get returns the value corresponding to the given path from the fsm
func (b *RaftBackend) Get(ctx context.Context, path string) (*physical.Entry, error) {
defer metrics.MeasureSince([]string{"raft-storage", "get"}, time.Now())
if b.fsm == nil {
return nil, errors.New("raft: fsm not configured")
}
b.permitPool.Acquire()
defer b.permitPool.Release()
return b.fsm.Get(ctx, path)
}
// Put inserts an entry in the log for the put operation
func (b *RaftBackend) Put(ctx context.Context, entry *physical.Entry) error {
defer metrics.MeasureSince([]string{"raft-storage", "put"}, time.Now())
command := &LogData{
Operations: []*LogOperation{
&LogOperation{
OpType: putOp,
Key: entry.Key,
Value: entry.Value,
},
},
}
b.permitPool.Acquire()
defer b.permitPool.Release()
b.l.RLock()
err := b.applyLog(ctx, command)
b.l.RUnlock()
return err
}
// List enumerates all the items under the prefix from the fsm
func (b *RaftBackend) List(ctx context.Context, prefix string) ([]string, error) {
defer metrics.MeasureSince([]string{"raft-storage", "list"}, time.Now())
if b.fsm == nil {
return nil, errors.New("raft: fsm not configured")
}
b.permitPool.Acquire()
defer b.permitPool.Release()
return b.fsm.List(ctx, prefix)
}
// Transaction applies all the given operations into a single log and
// applies it.
func (b *RaftBackend) Transaction(ctx context.Context, txns []*physical.TxnEntry) error {
defer metrics.MeasureSince([]string{"raft-storage", "transaction"}, time.Now())
command := &LogData{
Operations: make([]*LogOperation, len(txns)),
}
for i, txn := range txns {
op := &LogOperation{}
switch txn.Operation {
case physical.PutOperation:
op.OpType = putOp
op.Key = txn.Entry.Key
op.Value = txn.Entry.Value
case physical.DeleteOperation:
op.OpType = deleteOp
op.Key = txn.Entry.Key
default:
return fmt.Errorf("%q is not a supported transaction operation", txn.Operation)
}
command.Operations[i] = op
}
b.permitPool.Acquire()
defer b.permitPool.Release()
b.l.RLock()
err := b.applyLog(ctx, command)
b.l.RUnlock()
return err
}
// applyLog will take a given log command and apply it to the raft log. applyLog
// doesn't return until the log has been applied to a quorum of servers and is
// persisted to the local FSM. Caller should hold the backend's read lock.
func (b *RaftBackend) applyLog(ctx context.Context, command *LogData) error {
if b.raft == nil {
return errors.New("raft storage backend is not initialized")
}
commandBytes, err := proto.Marshal(command)
if err != nil {
return err
}
var chunked bool
var applyFuture raft.ApplyFuture
switch {
case len(commandBytes) <= raftchunking.ChunkSize:
applyFuture = b.raft.Apply(commandBytes, 0)
default:
chunked = true
applyFuture = raftchunking.ChunkingApply(commandBytes, nil, 0, b.raft.ApplyLog)
}
if err := applyFuture.Error(); err != nil {
return err
}
resp := applyFuture.Response()
if chunked {
// In this case we didn't apply all chunks successfully, possibly due
// to a term change
if resp == nil {
// This returns the error in the interface because the raft library
// returns errors from the FSM via the future, not via err from the
// apply function. Downstream client code expects to see any error
// from the FSM (as opposed to the apply itself) and decide whether
// it can retry in the future's response.
return errors.New("applying chunking failed, please retry")
}
// We expect that this conversion should always work
chunkedSuccess, ok := resp.(raftchunking.ChunkingSuccess)
if !ok {
return errors.New("unknown type of response back from chunking FSM")
}
// Replace the reply with the inner wrapped version
resp = chunkedSuccess.Response
}
if resp, ok := resp.(*FSMApplyResponse); !ok || !resp.Success {
return errors.New("could not apply data")
}
return nil
}
// HAEnabled is the implemention of the HABackend interface
func (b *RaftBackend) HAEnabled() bool { return true }
// HAEnabled is the implemention of the HABackend interface
func (b *RaftBackend) LockWith(key, value string) (physical.Lock, error) {
return &RaftLock{
key: key,
value: []byte(value),
b: b,
}, nil
}
// RaftLock implements the physical Lock interface and enables HA for this
// backend. The Lock uses the raftNotifyCh for receiving leadership edge
// triggers. Vault's active duty matches raft's leadership.
type RaftLock struct {
key string
value []byte
b *RaftBackend
}
// monitorLeadership waits until we receive an update on the raftNotifyCh and
// closes the leaderLost channel.
func (l *RaftLock) monitorLeadership(stopCh <-chan struct{}, leaderNotifyCh <-chan bool) <-chan struct{} {
leaderLost := make(chan struct{})
go func() {
select {
case isLeader := <-leaderNotifyCh:
if !isLeader {
close(leaderLost)
}
case <-stopCh:
}
}()
return leaderLost
}
// Lock blocks until we become leader or are shutdown. It returns a channel that
// is closed when we detect a loss of leadership.
func (l *RaftLock) Lock(stopCh <-chan struct{}) (<-chan struct{}, error) {
l.b.l.RLock()
// Cache the notifyCh locally
leaderNotifyCh := l.b.raftNotifyCh
// Check to see if we are already leader.
if l.b.raft.State() == raft.Leader {
err := l.b.applyLog(context.Background(), &LogData{
Operations: []*LogOperation{
&LogOperation{
OpType: putOp,
Key: l.key,
Value: l.value,
},
},
})
l.b.l.RUnlock()
if err != nil {
return nil, err
}
return l.monitorLeadership(stopCh, leaderNotifyCh), nil
}
l.b.l.RUnlock()
for {
select {
case isLeader := <-leaderNotifyCh:
if isLeader {
// We are leader, set the key
l.b.l.RLock()
err := l.b.applyLog(context.Background(), &LogData{
Operations: []*LogOperation{
&LogOperation{
OpType: putOp,
Key: l.key,
Value: l.value,
},
},
})
l.b.l.RUnlock()
if err != nil {
return nil, err
}
return l.monitorLeadership(stopCh, leaderNotifyCh), nil
}
case <-stopCh:
return nil, nil
}
}
return nil, nil
}
// Unlock gives up leadership.
func (l *RaftLock) Unlock() error {
return l.b.raft.LeadershipTransfer().Error()
}
// Value reads the value of the lock. This informs us who is currently leader.
func (l *RaftLock) Value() (bool, string, error) {
e, err := l.b.Get(context.Background(), l.key)
if err != nil {
return false, "", err
}
if e == nil {
return false, "", nil
}
value := string(e.Value)
// TODO: how to tell if held?
return true, value, nil
}
// sealer implements the snapshot.Sealer interface and is used in the snapshot
// process for encrypting/decrypting the SHASUM file in snapshot archives.
type sealer struct {
access *seal.Access
}
// Seal encrypts the data with using the seal access object.
func (s sealer) Seal(ctx context.Context, pt []byte) ([]byte, error) {
if s.access == nil {
return nil, errors.New("no seal access available")
}
eblob, err := s.access.Encrypt(ctx, pt, nil)
if err != nil {
return nil, err
}
return proto.Marshal(eblob)
}
// Open decrypts the data using the seal access object.
func (s sealer) Open(ctx context.Context, ct []byte) ([]byte, error) {
if s.access == nil {
return nil, errors.New("no seal access available")
}
var eblob wrapping.EncryptedBlobInfo
err := proto.Unmarshal(ct, &eblob)
if err != nil {
return nil, err
}
return s.access.Decrypt(ctx, &eblob, nil)
}