package raft

import (
	"bytes"
	"context"
	"encoding/hex"
	"errors"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/armon/go-metrics"
	"github.com/golang/protobuf/proto"
	log "github.com/hashicorp/go-hclog"
	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/go-raftchunking"
	"github.com/hashicorp/go-secure-stdlib/strutil"
	"github.com/hashicorp/raft"
	"github.com/hashicorp/vault/sdk/helper/jsonutil"
	"github.com/hashicorp/vault/sdk/physical"
	"github.com/hashicorp/vault/sdk/plugin/pb"
	bolt "go.etcd.io/bbolt"
)

const (
	deleteOp uint32 = 1 << iota
	putOp
	restoreCallbackOp
	getOp

	chunkingPrefix   = "raftchunking/"
	databaseFilename = "vault.db"
)

var (
	// dataBucketName is the name of the bolt bucket that holds Vault's data entries
	dataBucketName     = []byte("data")
	configBucketName   = []byte("config")
	latestIndexKey     = []byte("latest_indexes")
	latestConfigKey    = []byte("latest_config")
	localNodeConfigKey = []byte("local_node_config")
)

// Verify FSM satisfies the correct interfaces
var (
	_ physical.Backend       = (*FSM)(nil)
	_ physical.Transactional = (*FSM)(nil)
	_ raft.FSM               = (*FSM)(nil)
	_ raft.BatchingFSM       = (*FSM)(nil)
)

type restoreCallback func(context.Context) error

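// FSMEntry is a single key/value result returned for a get operation that was
// applied through the FSM.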
type FSMEntry struct {
	Key   string
	Value []byte
}

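// String returns the entry's key and its hex-encoded value.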
func (f *FSMEntry) String() string {
	return fmt.Sprintf("Key: %s. Value: %s", f.Key, hex.EncodeToString(f.Value))
}

// FSMApplyResponse is returned from an FSM apply. It indicates if the apply was
// successful or not. EntrySlice contains the keys/values from the Get operations.
type FSMApplyResponse struct {
	Success    bool
	EntrySlice []*FSMEntry
}

// FSM is Vault's primary state storage. It writes updates to a bolt db file
// that lives on local disk. FSM implements raft.FSM and physical.Backend
// interfaces.
type FSM struct {
	// latestIndex and latestTerm must stay at the top of this struct to be
	// properly 64-bit aligned.

	// latestIndex and latestTerm are the term and index of the last log we
	// received
	latestIndex *uint64
	latestTerm  *uint64
	// latestConfig is the latest server configuration we've seen
	latestConfig atomic.Value

	l           sync.RWMutex
	path        string
	logger      log.Logger
	noopRestore bool

	// applyCallback is used to control the pace of applies in tests
	applyCallback func()

	db *bolt.DB

	// restoreCb is called after we've restored a snapshot
	restoreCb restoreCallback

	chunker *raftchunking.ChunkingBatchingFSM

	localID         string
	desiredSuffrage string
	unknownOpTypes  sync.Map
}

// NewFSM constructs a FSM using the given directory
func NewFSM(path string, localID string, logger log.Logger) (*FSM, error) {
	// Initialize the latest term, index, and config values
	latestTerm := new(uint64)
	latestIndex := new(uint64)
	latestConfig := atomic.Value{}
	atomic.StoreUint64(latestTerm, 0)
	atomic.StoreUint64(latestIndex, 0)
	latestConfig.Store((*ConfigurationValue)(nil))

	f := &FSM{
		path:   path,
		logger: logger,

		latestTerm:   latestTerm,
		latestIndex:  latestIndex,
		latestConfig: latestConfig,
		// Assume that the default intent is to join as a voter. This will be updated
		// when this node joins a cluster with a different suffrage, or during cluster
		// setup if this is already part of a cluster with a desired suffrage.
		desiredSuffrage: "voter",
		localID:         localID,
	}

	f.chunker = raftchunking.NewChunkingBatchingFSM(f, &FSMChunkStorage{
		f:   f,
		ctx: context.Background(),
	})

	dbPath := filepath.Join(path, databaseFilename)
	f.l.Lock()
	defer f.l.Unlock()
	if err := f.openDBFile(dbPath); err != nil {
		return nil, fmt.Errorf("failed to open bolt file: %w", err)
	}

	return f, nil
}
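
// Illustrative sketch (not part of this package): once constructed, an FSM can
// be used directly through the physical.Backend interface. The path and node
// ID below are hypothetical.
//
//	fsm, err := NewFSM("/var/lib/vault/raft", "node-1", log.Default())
//	if err != nil {
//		// handle error
//	}
//	_ = fsm.Put(context.Background(), &physical.Entry{Key: "foo", Value: []byte("bar")})
//	entry, _ := fsm.Get(context.Background(), "foo")
//	_ = entry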
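
// getDB returns the FSM's underlying bolt database, taking the FSM's read lock
// while fetching the pointer.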
func (f *FSM) getDB() *bolt.DB {
	f.l.RLock()
	defer f.l.RUnlock()

	return f.db
}

// SetFSMDelay adds a delay to the FSM apply. This is used in tests to simulate
// a slow apply.
func (r *RaftBackend) SetFSMDelay(delay time.Duration) {
	r.SetFSMApplyCallback(func() { time.Sleep(delay) })
}

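// SetFSMApplyCallback sets a function that the FSM invokes at the start of each
// batch of applies. It is used in tests to control the pace of applies.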
func (r *RaftBackend) SetFSMApplyCallback(f func()) {
	r.fsm.l.Lock()
	r.fsm.applyCallback = f
	r.fsm.l.Unlock()
}

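// openDBFile opens (creating if necessary) the bolt database at the given
// path, ensures the data and config buckets exist, and loads the latest index,
// term, and configuration into memory. It is called with the FSM's write lock
// held.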
func (f *FSM) openDBFile(dbPath string) error {
	if len(dbPath) == 0 {
		return errors.New("cannot open empty filename")
	}

	st, err := os.Stat(dbPath)
	switch {
	case err != nil && os.IsNotExist(err):
	case err != nil:
		return fmt.Errorf("error checking raft FSM db file %q: %v", dbPath, err)
	default:
		perms := st.Mode() & os.ModePerm
		if perms&0o077 != 0 {
			f.logger.Warn("raft FSM db file has wider permissions than needed",
				"needed", os.FileMode(0o600), "existing", perms)
		}
	}

	opts := boltOptions(dbPath)
	start := time.Now()
	boltDB, err := bolt.Open(dbPath, 0o600, opts)
	if err != nil {
		return err
	}
	elapsed := time.Now().Sub(start)
	f.logger.Debug("time to open database", "elapsed", elapsed, "path", dbPath)
	metrics.MeasureSince([]string{"raft_storage", "fsm", "open_db_file"}, start)

	err = boltDB.Update(func(tx *bolt.Tx) error {
		// make sure we have the necessary buckets created
		_, err := tx.CreateBucketIfNotExists(dataBucketName)
		if err != nil {
			return fmt.Errorf("failed to create bucket: %v", err)
		}
		b, err := tx.CreateBucketIfNotExists(configBucketName)
		if err != nil {
			return fmt.Errorf("failed to create bucket: %v", err)
		}

		// Read in our latest index and term and populate them in memory
		val := b.Get(latestIndexKey)
		if val != nil {
			var latest IndexValue
			err := proto.Unmarshal(val, &latest)
			if err != nil {
				return err
			}

			atomic.StoreUint64(f.latestTerm, latest.Term)
			atomic.StoreUint64(f.latestIndex, latest.Index)
		}

		// Read in our latest config and populate it in memory
		val = b.Get(latestConfigKey)
		if val != nil {
			var latest ConfigurationValue
			err := proto.Unmarshal(val, &latest)
			if err != nil {
				return err
			}

			f.latestConfig.Store(&latest)
		}
		return nil
	})
	if err != nil {
		return err
	}

	f.db = boltDB
	return nil
}

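// Close closes the FSM's underlying bolt database.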
func (f *FSM) Close() error {
	f.l.RLock()
	defer f.l.RUnlock()

	return f.db.Close()
}

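// writeSnapshotMetaToDB persists the index, term, and configuration from the
// given snapshot metadata into the config bucket of the provided bolt database.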
func writeSnapshotMetaToDB(metadata *raft.SnapshotMeta, db *bolt.DB) error {
	latestIndex := &IndexValue{
		Term:  metadata.Term,
		Index: metadata.Index,
	}
	indexBytes, err := proto.Marshal(latestIndex)
	if err != nil {
		return err
	}

	protoConfig := raftConfigurationToProtoConfiguration(metadata.ConfigurationIndex, metadata.Configuration)
	configBytes, err := proto.Marshal(protoConfig)
	if err != nil {
		return err
	}

	err = db.Update(func(tx *bolt.Tx) error {
		b, err := tx.CreateBucketIfNotExists(configBucketName)
		if err != nil {
			return err
		}

		err = b.Put(latestConfigKey, configBytes)
		if err != nil {
			return err
		}

		err = b.Put(latestIndexKey, indexBytes)
		if err != nil {
			return err
		}

		return nil
	})
	if err != nil {
		return err
	}

	return nil
}

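// localNodeConfig reads the local node config from the bolt database. It
// returns nil (and no error) when no config has been persisted yet; when a
// config is found, the FSM's desiredSuffrage is updated from it.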
func (f *FSM) localNodeConfig() (*LocalNodeConfigValue, error) {
	var configBytes []byte
	if err := f.db.View(func(tx *bolt.Tx) error {
		value := tx.Bucket(configBucketName).Get(localNodeConfigKey)
		if value != nil {
			configBytes = make([]byte, len(value))
			copy(configBytes, value)
		}
		return nil
	}); err != nil {
		return nil, err
	}
	if configBytes == nil {
		return nil, nil
	}

	var lnConfig LocalNodeConfigValue
	if err := proto.Unmarshal(configBytes, &lnConfig); err != nil {
		return nil, err
	}

	f.desiredSuffrage = lnConfig.DesiredSuffrage
	return &lnConfig, nil
}

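// DesiredSuffrage returns the node's desired suffrage ("voter" or "non-voter").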
func (f *FSM) DesiredSuffrage() string {
	f.l.RLock()
	defer f.l.RUnlock()

	return f.desiredSuffrage
}

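// upgradeLocalNodeConfig ensures a local node config entry exists in the bolt
// database. If one is already present its desired suffrage is adopted;
// otherwise a new entry is created, deriving the desired suffrage from the
// latest raft configuration when available and defaulting to "voter".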
func (f *FSM) upgradeLocalNodeConfig() error {
	f.l.Lock()
	defer f.l.Unlock()

	// Read the local node config
	lnConfig, err := f.localNodeConfig()
	if err != nil {
		return err
	}

	// Entry is already present. Get the suffrage value.
	if lnConfig != nil {
		f.desiredSuffrage = lnConfig.DesiredSuffrage
		return nil
	}

	//
	// This is the upgrade case where there is no entry
	//

	lnConfig = &LocalNodeConfigValue{}

	// Refer to the persisted latest raft config
	config := f.latestConfig.Load().(*ConfigurationValue)

	// If there is no config, then this is a fresh node coming up. This could end up
	// being a voter or non-voter. But by default assume that this is a voter. It
	// will be changed if this node joins the cluster as a non-voter.
	if config == nil {
		f.desiredSuffrage = "voter"
		lnConfig.DesiredSuffrage = f.desiredSuffrage
		return f.persistDesiredSuffrage(lnConfig)
	}

	// Get the last known suffrage of the node and assume that it is the desired
	// suffrage. There is no better alternative here.
	for _, srv := range config.Servers {
		if srv.Id == f.localID {
			switch srv.Suffrage {
			case int32(raft.Nonvoter):
				lnConfig.DesiredSuffrage = "non-voter"
			default:
				lnConfig.DesiredSuffrage = "voter"
			}
			// Bring the intent to the fsm instance.
			f.desiredSuffrage = lnConfig.DesiredSuffrage
			break
		}
	}

	return f.persistDesiredSuffrage(lnConfig)
}

// recordSuffrage is called when a node successfully joins the cluster. This
// intent should land in the stored configuration. If the config isn't available
// yet, we still go ahead and store the intent in the fsm. During the next
// update to the configuration, this intent will be persisted.
func (f *FSM) recordSuffrage(desiredSuffrage string) error {
	f.l.Lock()
	defer f.l.Unlock()

	if err := f.persistDesiredSuffrage(&LocalNodeConfigValue{
		DesiredSuffrage: desiredSuffrage,
	}); err != nil {
		return err
	}

	f.desiredSuffrage = desiredSuffrage
	return nil
}

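// persistDesiredSuffrage writes the given local node config to the config
// bucket of the bolt database.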
func (f *FSM) persistDesiredSuffrage(lnconfig *LocalNodeConfigValue) error {
	dsBytes, err := proto.Marshal(lnconfig)
	if err != nil {
		return err
	}

	return f.db.Update(func(tx *bolt.Tx) error {
		return tx.Bucket(configBucketName).Put(localNodeConfigKey, dsBytes)
	})
}

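// witnessSnapshot records the metadata of a snapshot in the bolt database and
// fast-forwards the in-memory index, term, and configuration to match it.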
func (f *FSM) witnessSnapshot(metadata *raft.SnapshotMeta) error {
	f.l.RLock()
	defer f.l.RUnlock()

	err := writeSnapshotMetaToDB(metadata, f.db)
	if err != nil {
		return err
	}

	atomic.StoreUint64(f.latestIndex, metadata.Index)
	atomic.StoreUint64(f.latestTerm, metadata.Term)
	f.latestConfig.Store(raftConfigurationToProtoConfiguration(metadata.ConfigurationIndex, metadata.Configuration))

	return nil
}

// LatestState returns the latest index and configuration values we have seen on
// this FSM.
func (f *FSM) LatestState() (*IndexValue, *ConfigurationValue) {
	return &IndexValue{
		Term:  atomic.LoadUint64(f.latestTerm),
		Index: atomic.LoadUint64(f.latestIndex),
	}, f.latestConfig.Load().(*ConfigurationValue)
}

// Delete deletes the given key from the bolt file.
func (f *FSM) Delete(ctx context.Context, path string) error {
	defer metrics.MeasureSince([]string{"raft_storage", "fsm", "delete"}, time.Now())

	f.l.RLock()
	defer f.l.RUnlock()

	return f.db.Update(func(tx *bolt.Tx) error {
		return tx.Bucket(dataBucketName).Delete([]byte(path))
	})
}

// DeletePrefix deletes all keys with the given prefix from the bolt file.
func (f *FSM) DeletePrefix(ctx context.Context, prefix string) error {
	defer metrics.MeasureSince([]string{"raft_storage", "fsm", "delete_prefix"}, time.Now())

	f.l.RLock()
	defer f.l.RUnlock()

	err := f.db.Update(func(tx *bolt.Tx) error {
		// Assume bucket exists and has keys
		c := tx.Bucket(dataBucketName).Cursor()

		prefixBytes := []byte(prefix)
		for k, _ := c.Seek(prefixBytes); k != nil && bytes.HasPrefix(k, prefixBytes); k, _ = c.Next() {
			if err := c.Delete(); err != nil {
				return err
			}
		}

		return nil
	})

	return err
}

// Get retrieves the value at the given path from the bolt file.
func (f *FSM) Get(ctx context.Context, path string) (*physical.Entry, error) {
	// TODO: Remove this outdated metric name in a future release
	defer metrics.MeasureSince([]string{"raft", "get"}, time.Now())
	defer metrics.MeasureSince([]string{"raft_storage", "fsm", "get"}, time.Now())

	f.l.RLock()
	defer f.l.RUnlock()

	var valCopy []byte
	var found bool

	err := f.db.View(func(tx *bolt.Tx) error {
		value := tx.Bucket(dataBucketName).Get([]byte(path))
		if value != nil {
			found = true
			valCopy = make([]byte, len(value))
			copy(valCopy, value)
		}

		return nil
	})
	if err != nil {
		return nil, err
	}
	if !found {
		return nil, nil
	}

	return &physical.Entry{
		Key:   path,
		Value: valCopy,
	}, nil
}

// Put writes the given entry to the bolt file.
func (f *FSM) Put(ctx context.Context, entry *physical.Entry) error {
	defer metrics.MeasureSince([]string{"raft_storage", "fsm", "put"}, time.Now())

	f.l.RLock()
	defer f.l.RUnlock()

	// Start a write transaction.
	return f.db.Update(func(tx *bolt.Tx) error {
		return tx.Bucket(dataBucketName).Put([]byte(entry.Key), entry.Value)
	})
}

// List retrieves the set of keys with the given prefix from the bolt file.
func (f *FSM) List(ctx context.Context, prefix string) ([]string, error) {
	// TODO: Remove this outdated metric name in a future release
	defer metrics.MeasureSince([]string{"raft", "list"}, time.Now())
	defer metrics.MeasureSince([]string{"raft_storage", "fsm", "list"}, time.Now())

	f.l.RLock()
	defer f.l.RUnlock()

	var keys []string

	err := f.db.View(func(tx *bolt.Tx) error {
		// Assume bucket exists and has keys
		c := tx.Bucket(dataBucketName).Cursor()

		prefixBytes := []byte(prefix)
		for k, _ := c.Seek(prefixBytes); k != nil && bytes.HasPrefix(k, prefixBytes); k, _ = c.Next() {
			key := string(k)
			key = strings.TrimPrefix(key, prefix)
			if i := strings.Index(key, "/"); i == -1 {
				// Add objects only from the current 'folder'
				keys = append(keys, key)
			} else {
				// Add truncated 'folder' paths
				if len(keys) == 0 || keys[len(keys)-1] != key[:i+1] {
					keys = append(keys, string(key[:i+1]))
				}
			}
		}

		return nil
	})

	return keys, err
}

// Transaction writes all the operations in the provided transaction to the bolt
// file.
func (f *FSM) Transaction(ctx context.Context, txns []*physical.TxnEntry) error {
	f.l.RLock()
	defer f.l.RUnlock()

	// Start a write transaction.
	err := f.db.Update(func(tx *bolt.Tx) error {
		b := tx.Bucket(dataBucketName)
		for _, txn := range txns {
			var err error
			switch txn.Operation {
			case physical.PutOperation:
				err = b.Put([]byte(txn.Entry.Key), txn.Entry.Value)
			case physical.DeleteOperation:
				err = b.Delete([]byte(txn.Entry.Key))
			default:
				return fmt.Errorf("%q is not a supported transaction operation", txn.Operation)
			}
			if err != nil {
				return err
			}
		}

		return nil
	})

	return err
}

// ApplyBatch will apply a set of logs to the FSM. This is called from the raft
// library.
func (f *FSM) ApplyBatch(logs []*raft.Log) []interface{} {
	numLogs := len(logs)

	if numLogs == 0 {
		return []interface{}{}
	}

	// We will construct one slice of FSMEntry results per log, holding the
	// values read by that log's get operations.
	entrySlices := make([][]*FSMEntry, 0, numLogs)

	// Do the unmarshalling first so we don't hold locks
	var latestConfiguration *ConfigurationValue
	commands := make([]interface{}, 0, numLogs)
	for _, l := range logs {
		switch l.Type {
		case raft.LogCommand:
			command := &LogData{}
			err := proto.Unmarshal(l.Data, command)
			if err != nil {
				f.logger.Error("error proto unmarshaling log data", "error", err)
				panic("error proto unmarshaling log data")
			}
			commands = append(commands, command)
		case raft.LogConfiguration:
			configuration := raft.DecodeConfiguration(l.Data)
			config := raftConfigurationToProtoConfiguration(l.Index, configuration)

			commands = append(commands, config)

			// Update the latest configuration the fsm has received; we will
			// store this after it has been committed to storage.
			latestConfiguration = config

		default:
			panic(fmt.Sprintf("got unexpected log type: %d", l.Type))
		}
	}

	// Only advance latest pointer if this log has a higher index value than
	// what we have seen in the past.
	var logIndex []byte
	var err error
	latestIndex, _ := f.LatestState()
	lastLog := logs[numLogs-1]
	if latestIndex.Index < lastLog.Index {
		logIndex, err = proto.Marshal(&IndexValue{
			Term:  lastLog.Term,
			Index: lastLog.Index,
		})
		if err != nil {
			f.logger.Error("unable to marshal latest index", "error", err)
			panic("unable to marshal latest index")
		}
	}

	f.l.RLock()
	defer f.l.RUnlock()

	if f.applyCallback != nil {
		f.applyCallback()
	}

	err = f.db.Update(func(tx *bolt.Tx) error {
		b := tx.Bucket(dataBucketName)
		for _, commandRaw := range commands {
			entrySlice := make([]*FSMEntry, 0)
			switch command := commandRaw.(type) {
			case *LogData:
				for _, op := range command.Operations {
					var err error
					switch op.OpType {
					case putOp:
						err = b.Put([]byte(op.Key), op.Value)
					case deleteOp:
						err = b.Delete([]byte(op.Key))
					case getOp:
						fsmEntry := &FSMEntry{
							Key: op.Key,
						}
						val := b.Get([]byte(op.Key))
						if len(val) > 0 {
							newVal := make([]byte, len(val))
							copy(newVal, val)
							fsmEntry.Value = newVal
						}
						entrySlice = append(entrySlice, fsmEntry)
					case restoreCallbackOp:
						if f.restoreCb != nil {
							// Kick off the restore callback function in a goroutine
							go f.restoreCb(context.Background())
						}
					default:
						if _, ok := f.unknownOpTypes.Load(op.OpType); !ok {
							f.logger.Error("unsupported transaction operation", "op", op.OpType)
							f.unknownOpTypes.Store(op.OpType, struct{}{})
						}
					}
					if err != nil {
						return err
					}
				}

			case *ConfigurationValue:
				b := tx.Bucket(configBucketName)
				configBytes, err := proto.Marshal(command)
				if err != nil {
					return err
				}
				if err := b.Put(latestConfigKey, configBytes); err != nil {
					return err
				}
			}

			entrySlices = append(entrySlices, entrySlice)
		}

		if len(logIndex) > 0 {
			b := tx.Bucket(configBucketName)
			err = b.Put(latestIndexKey, logIndex)
			if err != nil {
				return err
			}
		}

		return nil
	})
	if err != nil {
		f.logger.Error("failed to store data", "error", err)
		panic("failed to store data")
	}

	// If we advanced the latest value, update the in-memory representation too.
	if len(logIndex) > 0 {
		atomic.StoreUint64(f.latestTerm, lastLog.Term)
		atomic.StoreUint64(f.latestIndex, lastLog.Index)
	}

	// If one or more configuration changes were processed, store the latest one.
	if latestConfiguration != nil {
		f.latestConfig.Store(latestConfiguration)
	}

	// Build the responses. The logs array is used here to ensure we reply to
	// all command values, even if they are not of the types we expect. This
	// should futureproof this function from more log types being provided.
	resp := make([]interface{}, numLogs)
	for i := range logs {
		resp[i] = &FSMApplyResponse{
			Success:    true,
			EntrySlice: entrySlices[i],
		}
	}

	return resp
}

// Apply will apply a log value to the FSM. This is called from the raft
// library.
func (f *FSM) Apply(log *raft.Log) interface{} {
	return f.ApplyBatch([]*raft.Log{log})[0]
}

type writeErrorCloser interface {
	io.WriteCloser
	CloseWithError(error) error
}

// writeTo will copy the FSM's content to a remote sink. The data is written
// twice: once for use in determining various metadata attributes of the dataset
// (size, checksum, etc) and a second time to the sink of the data. We also use a
// proto delimited writer so we can stream proto messages to the sink.
func (f *FSM) writeTo(ctx context.Context, metaSink writeErrorCloser, sink writeErrorCloser) {
	defer metrics.MeasureSince([]string{"raft_storage", "fsm", "write_snapshot"}, time.Now())

	protoWriter := NewDelimitedWriter(sink)
	metadataProtoWriter := NewDelimitedWriter(metaSink)

	f.l.RLock()
	defer f.l.RUnlock()

	err := f.db.View(func(tx *bolt.Tx) error {
		b := tx.Bucket(dataBucketName)

		c := b.Cursor()

		// Do the first scan of the data for metadata purposes.
		for k, v := c.First(); k != nil; k, v = c.Next() {
			err := metadataProtoWriter.WriteMsg(&pb.StorageEntry{
				Key:   string(k),
				Value: v,
			})
			if err != nil {
				metaSink.CloseWithError(err)
				return err
			}
		}
		metaSink.Close()

		// Do the second scan for copy purposes.
		for k, v := c.First(); k != nil; k, v = c.Next() {
			err := protoWriter.WriteMsg(&pb.StorageEntry{
				Key:   string(k),
				Value: v,
			})
			if err != nil {
				return err
			}
		}

		return nil
	})
	sink.CloseWithError(err)
}

// Snapshot implements the FSM interface. It returns a noop snapshot object.
func (f *FSM) Snapshot() (raft.FSMSnapshot, error) {
	return &noopSnapshotter{
		fsm: f,
	}, nil
}

// SetNoopRestore is used to disable restore operations on raft startup. Because
// we are using persistent storage in our FSM we do not need to issue a restore
// on startup.
func (f *FSM) SetNoopRestore(enabled bool) {
	f.l.Lock()
	f.noopRestore = enabled
	f.l.Unlock()
}

// Restore installs a new snapshot from the provided reader. It does an atomic
// rename of the snapshot file into the database filepath. While a restore is
// happening the FSM is locked and no writes or reads can be performed.
func (f *FSM) Restore(r io.ReadCloser) error {
	defer metrics.MeasureSince([]string{"raft_storage", "fsm", "restore_snapshot"}, time.Now())

	if f.noopRestore {
		return nil
	}

	snapshotInstaller, ok := r.(*boltSnapshotInstaller)
	if !ok {
		wrapper, ok := r.(raft.ReadCloserWrapper)
		if !ok {
			return fmt.Errorf("expected ReadCloserWrapper object, got: %T", r)
		}
		snapshotInstallerRaw := wrapper.WrappedReadCloser()
		snapshotInstaller, ok = snapshotInstallerRaw.(*boltSnapshotInstaller)
		if !ok {
			return fmt.Errorf("expected snapshot installer object, got: %T", snapshotInstallerRaw)
		}
	}

	f.l.Lock()
	defer f.l.Unlock()

	// Cache the local node config before closing the db file
	lnConfig, err := f.localNodeConfig()
	if err != nil {
		return err
	}

	// Close the db file
	if err := f.db.Close(); err != nil {
		f.logger.Error("failed to close database file", "error", err)
		return err
	}

	dbPath := filepath.Join(f.path, databaseFilename)

	f.logger.Info("installing snapshot to FSM")

	// Install the new boltdb file
	var retErr *multierror.Error
	if err := snapshotInstaller.Install(dbPath); err != nil {
		f.logger.Error("failed to install snapshot", "error", err)
		retErr = multierror.Append(retErr, fmt.Errorf("failed to install snapshot database: %w", err))
	} else {
		f.logger.Info("snapshot installed")
	}

	// Open the db file. We want to do this regardless of whether the above install
	// worked. If the install failed we should try to open the old DB file.
	if err := f.openDBFile(dbPath); err != nil {
		f.logger.Error("failed to open new database file", "error", err)
		retErr = multierror.Append(retErr, fmt.Errorf("failed to open new bolt file: %w", err))
	}

	// Handle local node config restore. lnConfig should not be nil here, but
	// add the nil check anyway for safety.
	if lnConfig != nil {
		// Persist the local node config on the restored fsm.
		if err := f.persistDesiredSuffrage(lnConfig); err != nil {
			f.logger.Error("failed to persist local node config from before the restore", "error", err)
			retErr = multierror.Append(retErr, fmt.Errorf("failed to persist local node config from before the restore: %w", err))
		}
	}

	return retErr.ErrorOrNil()
}

// noopSnapshotter implements the fsm.Snapshot interface. It doesn't do anything
// since our SnapshotStore reads data out of the FSM on Open().
type noopSnapshotter struct {
	fsm *FSM
}

// Persist implements the fsm.Snapshot interface. It doesn't need to persist any
// state data, but it does persist the raft metadata. This is necessary so we
// can be sure to capture indexes for operation types that are not sent to the
// FSM.
func (s *noopSnapshotter) Persist(sink raft.SnapshotSink) error {
	boltSnapshotSink := sink.(*BoltSnapshotSink)

	// We are processing a snapshot, fastforward the index, term, and
	// configuration to the latest seen by the raft system.
	if err := s.fsm.witnessSnapshot(&boltSnapshotSink.meta); err != nil {
		return err
	}

	return nil
}

// Release doesn't do anything.
func (s *noopSnapshotter) Release() {}

// raftConfigurationToProtoConfiguration converts a raft configuration object to
// a proto value.
func raftConfigurationToProtoConfiguration(index uint64, configuration raft.Configuration) *ConfigurationValue {
	servers := make([]*Server, len(configuration.Servers))
	for i, s := range configuration.Servers {
		servers[i] = &Server{
			Suffrage: int32(s.Suffrage),
			Id:       string(s.ID),
			Address:  string(s.Address),
		}
	}
	return &ConfigurationValue{
		Index:   index,
		Servers: servers,
	}
}

// protoConfigurationToRaftConfiguration converts a proto configuration object
// to a raft object.
func protoConfigurationToRaftConfiguration(configuration *ConfigurationValue) (uint64, raft.Configuration) {
	servers := make([]raft.Server, len(configuration.Servers))
	for i, s := range configuration.Servers {
		servers[i] = raft.Server{
			Suffrage: raft.ServerSuffrage(s.Suffrage),
			ID:       raft.ServerID(s.Id),
			Address:  raft.ServerAddress(s.Address),
		}
	}
	return configuration.Index, raft.Configuration{
		Servers: servers,
	}
}

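// FSMChunkStorage stores and retrieves go-raftchunking chunk data in the FSM's
// bolt database, under the chunkingPrefix keyspace.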
type FSMChunkStorage struct {
	f   *FSM
	ctx context.Context
}

// chunkPaths returns a disk prefix and key given chunkinfo
func (f *FSMChunkStorage) chunkPaths(chunk *raftchunking.ChunkInfo) (string, string) {
	prefix := fmt.Sprintf("%s%d/", chunkingPrefix, chunk.OpNum)
	key := fmt.Sprintf("%s%d", prefix, chunk.SequenceNum)
	return prefix, key
}

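// StoreChunk writes a single chunk to the data bucket and reports whether all
// chunks for that chunk's op number have now been seen.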
func (f *FSMChunkStorage) StoreChunk(chunk *raftchunking.ChunkInfo) (bool, error) {
	b, err := jsonutil.EncodeJSON(chunk)
	if err != nil {
		return false, fmt.Errorf("error encoding chunk info: %w", err)
	}

	prefix, key := f.chunkPaths(chunk)

	entry := &physical.Entry{
		Key:   key,
		Value: b,
	}

	f.f.l.RLock()
	defer f.f.l.RUnlock()

	// Start a write transaction.
	done := new(bool)
	if err := f.f.db.Update(func(tx *bolt.Tx) error {
		if err := tx.Bucket(dataBucketName).Put([]byte(entry.Key), entry.Value); err != nil {
			return fmt.Errorf("error storing chunk info: %w", err)
		}

		// Assume bucket exists and has keys
		c := tx.Bucket(dataBucketName).Cursor()

		var keys []string
		prefixBytes := []byte(prefix)
		for k, _ := c.Seek(prefixBytes); k != nil && bytes.HasPrefix(k, prefixBytes); k, _ = c.Next() {
			key := string(k)
			key = strings.TrimPrefix(key, prefix)
			if i := strings.Index(key, "/"); i == -1 {
				// Add objects only from the current 'folder'
				keys = append(keys, key)
			} else {
				// Add truncated 'folder' paths
				keys = strutil.AppendIfMissing(keys, string(key[:i+1]))
			}
		}

		*done = uint32(len(keys)) == chunk.NumChunks

		return nil
	}); err != nil {
		return false, err
	}

	return *done, nil
}

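// FinalizeOp returns all stored chunks for the given op number and deletes them
// from the FSM.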
func (f *FSMChunkStorage) FinalizeOp(opNum uint64) ([]*raftchunking.ChunkInfo, error) {
	ret, err := f.chunksForOpNum(opNum)
	if err != nil {
		return nil, fmt.Errorf("error getting chunks for op keys: %w", err)
	}

	prefix, _ := f.chunkPaths(&raftchunking.ChunkInfo{OpNum: opNum})
	if err := f.f.DeletePrefix(f.ctx, prefix); err != nil {
		return nil, fmt.Errorf("error deleting prefix after op finalization: %w", err)
	}

	return ret, nil
}

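// chunksForOpNum loads every stored chunk for the given op number, returning a
// slice indexed by sequence number, or nil if no chunks are stored.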
func (f *FSMChunkStorage) chunksForOpNum(opNum uint64) ([]*raftchunking.ChunkInfo, error) {
	prefix, _ := f.chunkPaths(&raftchunking.ChunkInfo{OpNum: opNum})

	opChunkKeys, err := f.f.List(f.ctx, prefix)
	if err != nil {
		return nil, fmt.Errorf("error fetching op chunk keys: %w", err)
	}

	if len(opChunkKeys) == 0 {
		return nil, nil
	}

	var ret []*raftchunking.ChunkInfo

	for _, v := range opChunkKeys {
		seqNum, err := strconv.ParseInt(v, 10, 64)
		if err != nil {
			return nil, fmt.Errorf("error converting seqnum to integer: %w", err)
		}

		entry, err := f.f.Get(f.ctx, prefix+v)
		if err != nil {
			return nil, fmt.Errorf("error fetching chunkinfo: %w", err)
		}

		var ci raftchunking.ChunkInfo
		if err := jsonutil.DecodeJSON(entry.Value, &ci); err != nil {
			return nil, fmt.Errorf("error decoding chunkinfo json: %w", err)
		}

		if ret == nil {
			ret = make([]*raftchunking.ChunkInfo, ci.NumChunks)
		}

		ret[seqNum] = &ci
	}

	return ret, nil
}

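// GetChunks returns all chunks currently stored in the FSM as a
// raftchunking.ChunkMap keyed by op number.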
func (f *FSMChunkStorage) GetChunks() (raftchunking.ChunkMap, error) {
	opNums, err := f.f.List(f.ctx, chunkingPrefix)
	if err != nil {
		return nil, fmt.Errorf("error doing recursive list for chunk saving: %w", err)
	}

	if len(opNums) == 0 {
		return nil, nil
	}

	ret := make(raftchunking.ChunkMap, len(opNums))
	for _, opNumStr := range opNums {
		opNum, err := strconv.ParseInt(opNumStr, 10, 64)
		if err != nil {
			return nil, fmt.Errorf("error parsing op num during chunk saving: %w", err)
		}

		opChunks, err := f.chunksForOpNum(uint64(opNum))
		if err != nil {
			return nil, fmt.Errorf("error getting chunks for op keys during chunk saving: %w", err)
		}

		ret[uint64(opNum)] = opChunks
	}

	return ret, nil
}

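// RestoreChunks replaces any chunk data currently stored in the FSM with the
// provided chunk map.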
func (f *FSMChunkStorage) RestoreChunks(chunks raftchunking.ChunkMap) error {
	if err := f.f.DeletePrefix(f.ctx, chunkingPrefix); err != nil {
		return fmt.Errorf("error deleting prefix for chunk restoration: %w", err)
	}
	if len(chunks) == 0 {
		return nil
	}

	for opNum, opChunks := range chunks {
		for _, chunk := range opChunks {
			if chunk == nil {
				continue
			}
			if chunk.OpNum != opNum {
				return errors.New("unexpected op number in chunk")
			}
			if _, err := f.StoreChunk(chunk); err != nil {
				return fmt.Errorf("error storing chunk during restoration: %w", err)
			}
		}
	}

	return nil
}