package raft import ( "bytes" "context" "errors" "fmt" "io" "path/filepath" "strconv" "strings" "sync" "sync/atomic" "time" metrics "github.com/armon/go-metrics" "github.com/golang/protobuf/proto" "github.com/hashicorp/errwrap" log "github.com/hashicorp/go-hclog" "github.com/hashicorp/go-multierror" "github.com/hashicorp/go-raftchunking" "github.com/hashicorp/raft" "github.com/hashicorp/vault/sdk/helper/jsonutil" "github.com/hashicorp/vault/sdk/helper/strutil" "github.com/hashicorp/vault/sdk/physical" "github.com/hashicorp/vault/sdk/plugin/pb" bolt "go.etcd.io/bbolt" ) const ( deleteOp uint32 = 1 << iota putOp restoreCallbackOp chunkingPrefix = "raftchunking/" databaseFilename = "vault.db" ) var ( // dataBucketName is the value we use for the bucket dataBucketName = []byte("data") configBucketName = []byte("config") latestIndexKey = []byte("latest_indexes") latestConfigKey = []byte("latest_config") ) // Verify FSM satisfies the correct interfaces var _ physical.Backend = (*FSM)(nil) var _ physical.Transactional = (*FSM)(nil) var _ raft.FSM = (*FSM)(nil) var _ raft.BatchingFSM = (*FSM)(nil) type restoreCallback func(context.Context) error // FSMApplyResponse is returned from an FSM apply. It indicates if the apply was // successful or not. type FSMApplyResponse struct { Success bool } // FSM is Vault's primary state storage. It writes updates to an bolt db file // that lives on local disk. FSM implements raft.FSM and physical.Backend // interfaces. type FSM struct { // latestIndex and latestTerm must stay at the top of this struct to be // properly 64-bit aligned. // latestIndex and latestTerm are the term and index of the last log we // received latestIndex *uint64 latestTerm *uint64 // latestConfig is the latest server configuration we've seen latestConfig atomic.Value l sync.RWMutex path string logger log.Logger noopRestore bool // applyDelay is used to simulate a slow apply in tests applyDelay time.Duration db *bolt.DB // retoreCb is called after we've restored a snapshot restoreCb restoreCallback chunker *raftchunking.ChunkingBatchingFSM } // NewFSM constructs a FSM using the given directory func NewFSM(path string, logger log.Logger) (*FSM, error) { // Initialize the latest term, index, and config values latestTerm := new(uint64) latestIndex := new(uint64) latestConfig := atomic.Value{} atomic.StoreUint64(latestTerm, 0) atomic.StoreUint64(latestIndex, 0) latestConfig.Store((*ConfigurationValue)(nil)) f := &FSM{ path: path, logger: logger, latestTerm: latestTerm, latestIndex: latestIndex, latestConfig: latestConfig, } f.chunker = raftchunking.NewChunkingBatchingFSM(f, &FSMChunkStorage{ f: f, ctx: context.Background(), }) dbPath := filepath.Join(path, databaseFilename) if err := f.openDBFile(dbPath); err != nil { return nil, errwrap.Wrapf("failed to open bolt file: {{err}}", err) } return f, nil } func (f *FSM) getDB() *bolt.DB { f.l.RLock() defer f.l.RUnlock() return f.db } // SetFSMDelay adds a delay to the FSM apply. This is used in tests to simulate // a slow apply. func (r *RaftBackend) SetFSMDelay(delay time.Duration) { r.fsm.l.Lock() r.fsm.applyDelay = delay r.fsm.l.Unlock() } func (f *FSM) openDBFile(dbPath string) error { if len(dbPath) == 0 { return errors.New("can not open empty filename") } boltDB, err := bolt.Open(dbPath, 0666, &bolt.Options{Timeout: 1 * time.Second}) if err != nil { return err } err = boltDB.Update(func(tx *bolt.Tx) error { // make sure we have the necessary buckets created _, err := tx.CreateBucketIfNotExists(dataBucketName) if err != nil { return fmt.Errorf("failed to create bucket: %v", err) } b, err := tx.CreateBucketIfNotExists(configBucketName) if err != nil { return fmt.Errorf("failed to create bucket: %v", err) } // Read in our latest index and term and populate it inmemory val := b.Get(latestIndexKey) if val != nil { var latest IndexValue err := proto.Unmarshal(val, &latest) if err != nil { return err } atomic.StoreUint64(f.latestTerm, latest.Term) atomic.StoreUint64(f.latestIndex, latest.Index) } // Read in our latest config and populate it inmemory val = b.Get(latestConfigKey) if val != nil { var latest ConfigurationValue err := proto.Unmarshal(val, &latest) if err != nil { return err } f.latestConfig.Store(&latest) } return nil }) if err != nil { return err } f.db = boltDB return nil } func (f *FSM) Close() error { f.l.RLock() defer f.l.RUnlock() return f.db.Close() } func writeSnapshotMetaToDB(metadata *raft.SnapshotMeta, db *bolt.DB) error { latestIndex := &IndexValue{ Term: metadata.Term, Index: metadata.Index, } indexBytes, err := proto.Marshal(latestIndex) if err != nil { return err } protoConfig := raftConfigurationToProtoConfiguration(metadata.ConfigurationIndex, metadata.Configuration) configBytes, err := proto.Marshal(protoConfig) if err != nil { return err } err = db.Update(func(tx *bolt.Tx) error { b, err := tx.CreateBucketIfNotExists(configBucketName) if err != nil { return err } err = b.Put(latestConfigKey, configBytes) if err != nil { return err } err = b.Put(latestIndexKey, indexBytes) if err != nil { return err } return nil }) if err != nil { return err } return nil } func (f *FSM) witnessSnapshot(metadata *raft.SnapshotMeta) error { f.l.RLock() defer f.l.RUnlock() err := writeSnapshotMetaToDB(metadata, f.db) if err != nil { return err } atomic.StoreUint64(f.latestIndex, metadata.Index) atomic.StoreUint64(f.latestTerm, metadata.Term) f.latestConfig.Store(raftConfigurationToProtoConfiguration(metadata.ConfigurationIndex, metadata.Configuration)) return nil } // LatestState returns the latest index and configuration values we have seen on // this FSM. func (f *FSM) LatestState() (*IndexValue, *ConfigurationValue) { return &IndexValue{ Term: atomic.LoadUint64(f.latestTerm), Index: atomic.LoadUint64(f.latestIndex), }, f.latestConfig.Load().(*ConfigurationValue) } // Delete deletes the given key from the bolt file. func (f *FSM) Delete(ctx context.Context, path string) error { defer metrics.MeasureSince([]string{"raft_storage", "fsm", "delete"}, time.Now()) f.l.RLock() defer f.l.RUnlock() return f.db.Update(func(tx *bolt.Tx) error { return tx.Bucket(dataBucketName).Delete([]byte(path)) }) } // Delete deletes the given key from the bolt file. func (f *FSM) DeletePrefix(ctx context.Context, prefix string) error { defer metrics.MeasureSince([]string{"raft_storage", "fsm", "delete_prefix"}, time.Now()) f.l.RLock() defer f.l.RUnlock() err := f.db.Update(func(tx *bolt.Tx) error { // Assume bucket exists and has keys c := tx.Bucket(dataBucketName).Cursor() prefixBytes := []byte(prefix) for k, _ := c.Seek(prefixBytes); k != nil && bytes.HasPrefix(k, prefixBytes); k, _ = c.Next() { if err := c.Delete(); err != nil { return err } } return nil }) return err } // Get retrieves the value at the given path from the bolt file. func (f *FSM) Get(ctx context.Context, path string) (*physical.Entry, error) { // TODO: Remove this outdated metric name in an older release defer metrics.MeasureSince([]string{"raft", "get"}, time.Now()) defer metrics.MeasureSince([]string{"raft_storage", "fsm", "get"}, time.Now()) f.l.RLock() defer f.l.RUnlock() var valCopy []byte var found bool err := f.db.View(func(tx *bolt.Tx) error { value := tx.Bucket(dataBucketName).Get([]byte(path)) if value != nil { found = true valCopy = make([]byte, len(value)) copy(valCopy, value) } return nil }) if err != nil { return nil, err } if !found { return nil, nil } return &physical.Entry{ Key: path, Value: valCopy, }, nil } // Put writes the given entry to the bolt file. func (f *FSM) Put(ctx context.Context, entry *physical.Entry) error { defer metrics.MeasureSince([]string{"raft_storage", "fsm", "put"}, time.Now()) f.l.RLock() defer f.l.RUnlock() // Start a write transaction. return f.db.Update(func(tx *bolt.Tx) error { return tx.Bucket(dataBucketName).Put([]byte(entry.Key), entry.Value) }) } // List retrieves the set of keys with the given prefix from the bolt file. func (f *FSM) List(ctx context.Context, prefix string) ([]string, error) { // TODO: Remove this outdated metric name in a future release defer metrics.MeasureSince([]string{"raft", "list"}, time.Now()) defer metrics.MeasureSince([]string{"raft_storage", "fsm", "list"}, time.Now()) f.l.RLock() defer f.l.RUnlock() var keys []string err := f.db.View(func(tx *bolt.Tx) error { // Assume bucket exists and has keys c := tx.Bucket(dataBucketName).Cursor() prefixBytes := []byte(prefix) for k, _ := c.Seek(prefixBytes); k != nil && bytes.HasPrefix(k, prefixBytes); k, _ = c.Next() { key := string(k) key = strings.TrimPrefix(key, prefix) if i := strings.Index(key, "/"); i == -1 { // Add objects only from the current 'folder' keys = append(keys, key) } else { // Add truncated 'folder' paths if len(keys) == 0 || keys[len(keys)-1] != key[:i+1] { keys = append(keys, string(key[:i+1])) } } } return nil }) return keys, err } // Transaction writes all the operations in the provided transaction to the bolt // file. func (f *FSM) Transaction(ctx context.Context, txns []*physical.TxnEntry) error { f.l.RLock() defer f.l.RUnlock() // Start a write transaction. err := f.db.Update(func(tx *bolt.Tx) error { b := tx.Bucket(dataBucketName) for _, txn := range txns { var err error switch txn.Operation { case physical.PutOperation: err = b.Put([]byte(txn.Entry.Key), txn.Entry.Value) case physical.DeleteOperation: err = b.Delete([]byte(txn.Entry.Key)) default: return fmt.Errorf("%q is not a supported transaction operation", txn.Operation) } if err != nil { return err } } return nil }) return err } // ApplyBatch will apply a set of logs to the FSM. This is called from the raft // library. func (f *FSM) ApplyBatch(logs []*raft.Log) []interface{} { if len(logs) == 0 { return []interface{}{} } // Do the unmarshalling first so we don't hold locks var latestConfiguration *ConfigurationValue commands := make([]interface{}, 0, len(logs)) for _, log := range logs { switch log.Type { case raft.LogCommand: command := &LogData{} err := proto.Unmarshal(log.Data, command) if err != nil { f.logger.Error("error proto unmarshaling log data", "error", err) panic("error proto unmarshaling log data") } commands = append(commands, command) case raft.LogConfiguration: configuration := raft.DecodeConfiguration(log.Data) config := raftConfigurationToProtoConfiguration(log.Index, configuration) commands = append(commands, config) // Update the latest configuration the fsm has received; we will // store this after it has been committed to storage. latestConfiguration = config default: panic(fmt.Sprintf("got unexpected log type: %d", log.Type)) } } // Only advance latest pointer if this log has a higher index value than // what we have seen in the past. var logIndex []byte var err error latestIndex, _ := f.LatestState() lastLog := logs[len(logs)-1] if latestIndex.Index < lastLog.Index { logIndex, err = proto.Marshal(&IndexValue{ Term: lastLog.Term, Index: lastLog.Index, }) if err != nil { f.logger.Error("unable to marshal latest index", "error", err) panic("unable to marshal latest index") } } f.l.RLock() defer f.l.RUnlock() if f.applyDelay > 0 { time.Sleep(f.applyDelay) } err = f.db.Update(func(tx *bolt.Tx) error { b := tx.Bucket(dataBucketName) for _, commandRaw := range commands { switch command := commandRaw.(type) { case *LogData: for _, op := range command.Operations { var err error switch op.OpType { case putOp: err = b.Put([]byte(op.Key), op.Value) case deleteOp: err = b.Delete([]byte(op.Key)) case restoreCallbackOp: if f.restoreCb != nil { // Kick off the restore callback function in a go routine go f.restoreCb(context.Background()) } default: return fmt.Errorf("%q is not a supported transaction operation", op.OpType) } if err != nil { return err } } case *ConfigurationValue: b := tx.Bucket(configBucketName) configBytes, err := proto.Marshal(command) if err != nil { return err } if err := b.Put(latestConfigKey, configBytes); err != nil { return err } } } if len(logIndex) > 0 { b := tx.Bucket(configBucketName) err = b.Put(latestIndexKey, logIndex) if err != nil { return err } } return nil }) if err != nil { f.logger.Error("failed to store data", "error", err) panic("failed to store data") } // If we advanced the latest value, update the in-memory representation too. if len(logIndex) > 0 { atomic.StoreUint64(f.latestTerm, lastLog.Term) atomic.StoreUint64(f.latestIndex, lastLog.Index) } // If one or more configuration changes were processed, store the latest one. if latestConfiguration != nil { f.latestConfig.Store(latestConfiguration) } // Build the responses. The logs array is used here to ensure we reply to // all command values; even if they are not of the types we expect. This // should future proof this function from more log types being provided. resp := make([]interface{}, len(logs)) for i := range logs { resp[i] = &FSMApplyResponse{ Success: true, } } return resp } // Apply will apply a log value to the FSM. This is called from the raft // library. func (f *FSM) Apply(log *raft.Log) interface{} { return f.ApplyBatch([]*raft.Log{log})[0] } type writeErrorCloser interface { io.WriteCloser CloseWithError(error) error } // writeTo will copy the FSM's content to a remote sink. The data is written // twice, once for use in determining various metadata attributes of the dataset // (size, checksum, etc) and a second for the sink of the data. We also use a // proto delimited writer so we can stream proto messages to the sink. func (f *FSM) writeTo(ctx context.Context, metaSink writeErrorCloser, sink writeErrorCloser) { defer metrics.MeasureSince([]string{"raft_storage", "fsm", "write_snapshot"}, time.Now()) protoWriter := NewDelimitedWriter(sink) metadataProtoWriter := NewDelimitedWriter(metaSink) f.l.RLock() defer f.l.RUnlock() err := f.db.View(func(tx *bolt.Tx) error { b := tx.Bucket(dataBucketName) c := b.Cursor() // Do the first scan of the data for metadata purposes. for k, v := c.First(); k != nil; k, v = c.Next() { err := metadataProtoWriter.WriteMsg(&pb.StorageEntry{ Key: string(k), Value: v, }) if err != nil { metaSink.CloseWithError(err) return err } } metaSink.Close() // Do the second scan for copy purposes. for k, v := c.First(); k != nil; k, v = c.Next() { err := protoWriter.WriteMsg(&pb.StorageEntry{ Key: string(k), Value: v, }) if err != nil { return err } } return nil }) sink.CloseWithError(err) } // Snapshot implements the FSM interface. It returns a noop snapshot object. func (f *FSM) Snapshot() (raft.FSMSnapshot, error) { return &noopSnapshotter{ fsm: f, }, nil } // SetNoopRestore is used to disable restore operations on raft startup. Because // we are using persistent storage in our FSM we do not need to issue a restore // on startup. func (f *FSM) SetNoopRestore(enabled bool) { f.l.Lock() f.noopRestore = enabled f.l.Unlock() } // Restore installs a new snapshot from the provided reader. It does an atomic // rename of the snapshot file into the database filepath. While a restore is // happening the FSM is locked and no writes or reads can be performed. func (f *FSM) Restore(r io.ReadCloser) error { defer metrics.MeasureSince([]string{"raft_storage", "fsm", "restore_snapshot"}, time.Now()) if f.noopRestore == true { return nil } snapshotInstaller, ok := r.(*boltSnapshotInstaller) if !ok { return errors.New("expected snapshot installer object") } f.l.Lock() defer f.l.Unlock() // Close the db file if err := f.db.Close(); err != nil { f.logger.Error("failed to close database file", "error", err) return err } dbPath := filepath.Join(f.path, databaseFilename) f.logger.Info("installing snapshot to FSM") // Install the new boltdb file var retErr *multierror.Error if err := snapshotInstaller.Install(dbPath); err != nil { f.logger.Error("failed to install snapshot", "error", err) retErr = multierror.Append(retErr, errwrap.Wrapf("failed to install snapshot database: {{err}}", err)) } else { f.logger.Info("snapshot installed") } // Open the db file. We want to do this regardless of if the above install // worked. If the install failed we should try to open the old DB file. if err := f.openDBFile(dbPath); err != nil { f.logger.Error("failed to open new database file", "error", err) retErr = multierror.Append(retErr, errwrap.Wrapf("failed to open new bolt file: {{err}}", err)) } return retErr.ErrorOrNil() } // noopSnapshotter implements the fsm.Snapshot interface. It doesn't do anything // since our SnapshotStore reads data out of the FSM on Open(). type noopSnapshotter struct { fsm *FSM } // Persist implements the fsm.Snapshot interface. It doesn't need to persist any // state data, but it does persist the raft metadata. This is necessary so we // can be sure to capture indexes for operation types that are not sent to the // FSM. func (s *noopSnapshotter) Persist(sink raft.SnapshotSink) error { boltSnapshotSink := sink.(*BoltSnapshotSink) // We are processing a snapshot, fastforward the index, term, and // configuration to the latest seen by the raft system. if err := s.fsm.witnessSnapshot(&boltSnapshotSink.meta); err != nil { return err } return nil } // Release doesn't do anything. func (s *noopSnapshotter) Release() {} // raftConfigurationToProtoConfiguration converts a raft configuration object to // a proto value. func raftConfigurationToProtoConfiguration(index uint64, configuration raft.Configuration) *ConfigurationValue { servers := make([]*Server, len(configuration.Servers)) for i, s := range configuration.Servers { servers[i] = &Server{ Suffrage: int32(s.Suffrage), Id: string(s.ID), Address: string(s.Address), } } return &ConfigurationValue{ Index: index, Servers: servers, } } // protoConfigurationToRaftConfiguration converts a proto configuration object // to a raft object. func protoConfigurationToRaftConfiguration(configuration *ConfigurationValue) (uint64, raft.Configuration) { servers := make([]raft.Server, len(configuration.Servers)) for i, s := range configuration.Servers { servers[i] = raft.Server{ Suffrage: raft.ServerSuffrage(s.Suffrage), ID: raft.ServerID(s.Id), Address: raft.ServerAddress(s.Address), } } return configuration.Index, raft.Configuration{ Servers: servers, } } type FSMChunkStorage struct { f *FSM ctx context.Context } // chunkPaths returns a disk prefix and key given chunkinfo func (f *FSMChunkStorage) chunkPaths(chunk *raftchunking.ChunkInfo) (string, string) { prefix := fmt.Sprintf("%s%d/", chunkingPrefix, chunk.OpNum) key := fmt.Sprintf("%s%d", prefix, chunk.SequenceNum) return prefix, key } func (f *FSMChunkStorage) StoreChunk(chunk *raftchunking.ChunkInfo) (bool, error) { b, err := jsonutil.EncodeJSON(chunk) if err != nil { return false, errwrap.Wrapf("error encoding chunk info: {{err}}", err) } prefix, key := f.chunkPaths(chunk) entry := &physical.Entry{ Key: key, Value: b, } f.f.l.RLock() defer f.f.l.RUnlock() // Start a write transaction. done := new(bool) if err := f.f.db.Update(func(tx *bolt.Tx) error { if err := tx.Bucket(dataBucketName).Put([]byte(entry.Key), entry.Value); err != nil { return errwrap.Wrapf("error storing chunk info: {{err}}", err) } // Assume bucket exists and has keys c := tx.Bucket(dataBucketName).Cursor() var keys []string prefixBytes := []byte(prefix) for k, _ := c.Seek(prefixBytes); k != nil && bytes.HasPrefix(k, prefixBytes); k, _ = c.Next() { key := string(k) key = strings.TrimPrefix(key, prefix) if i := strings.Index(key, "/"); i == -1 { // Add objects only from the current 'folder' keys = append(keys, key) } else { // Add truncated 'folder' paths keys = strutil.AppendIfMissing(keys, string(key[:i+1])) } } *done = uint32(len(keys)) == chunk.NumChunks return nil }); err != nil { return false, err } return *done, nil } func (f *FSMChunkStorage) FinalizeOp(opNum uint64) ([]*raftchunking.ChunkInfo, error) { ret, err := f.chunksForOpNum(opNum) if err != nil { return nil, errwrap.Wrapf("error getting chunks for op keys: {{err}}", err) } prefix, _ := f.chunkPaths(&raftchunking.ChunkInfo{OpNum: opNum}) if err := f.f.DeletePrefix(f.ctx, prefix); err != nil { return nil, errwrap.Wrapf("error deleting prefix after op finalization: {{err}}", err) } return ret, nil } func (f *FSMChunkStorage) chunksForOpNum(opNum uint64) ([]*raftchunking.ChunkInfo, error) { prefix, _ := f.chunkPaths(&raftchunking.ChunkInfo{OpNum: opNum}) opChunkKeys, err := f.f.List(f.ctx, prefix) if err != nil { return nil, errwrap.Wrapf("error fetching op chunk keys: {{err}}", err) } if len(opChunkKeys) == 0 { return nil, nil } var ret []*raftchunking.ChunkInfo for _, v := range opChunkKeys { seqNum, err := strconv.ParseInt(v, 10, 64) if err != nil { return nil, errwrap.Wrapf("error converting seqnum to integer: {{err}}", err) } entry, err := f.f.Get(f.ctx, prefix+v) if err != nil { return nil, errwrap.Wrapf("error fetching chunkinfo: {{err}}", err) } var ci raftchunking.ChunkInfo if err := jsonutil.DecodeJSON(entry.Value, &ci); err != nil { return nil, errwrap.Wrapf("error decoding chunkinfo json: {{err}}", err) } if ret == nil { ret = make([]*raftchunking.ChunkInfo, ci.NumChunks) } ret[seqNum] = &ci } return ret, nil } func (f *FSMChunkStorage) GetChunks() (raftchunking.ChunkMap, error) { opNums, err := f.f.List(f.ctx, chunkingPrefix) if err != nil { return nil, errwrap.Wrapf("error doing recursive list for chunk saving: {{err}}", err) } if len(opNums) == 0 { return nil, nil } ret := make(raftchunking.ChunkMap, len(opNums)) for _, opNumStr := range opNums { opNum, err := strconv.ParseInt(opNumStr, 10, 64) if err != nil { return nil, errwrap.Wrapf("error parsing op num during chunk saving: {{err}}", err) } opChunks, err := f.chunksForOpNum(uint64(opNum)) if err != nil { return nil, errwrap.Wrapf("error getting chunks for op keys during chunk saving: {{err}}", err) } ret[uint64(opNum)] = opChunks } return ret, nil } func (f *FSMChunkStorage) RestoreChunks(chunks raftchunking.ChunkMap) error { if err := f.f.DeletePrefix(f.ctx, chunkingPrefix); err != nil { return errwrap.Wrapf("error deleting prefix for chunk restoration: {{err}}", err) } if len(chunks) == 0 { return nil } for opNum, opChunks := range chunks { for _, chunk := range opChunks { if chunk == nil { continue } if chunk.OpNum != opNum { return errors.New("unexpected op number in chunk") } if _, err := f.StoreChunk(chunk); err != nil { return errwrap.Wrapf("error storing chunk during restoration: {{err}}", err) } } } return nil }