package raft import ( "bytes" "context" "encoding/hex" "errors" "fmt" "io" "os" "path/filepath" "strconv" "strings" "sync" "sync/atomic" "time" "github.com/armon/go-metrics" "github.com/golang/protobuf/proto" log "github.com/hashicorp/go-hclog" "github.com/hashicorp/go-multierror" "github.com/hashicorp/go-raftchunking" "github.com/hashicorp/go-secure-stdlib/strutil" "github.com/hashicorp/raft" "github.com/hashicorp/vault/sdk/helper/jsonutil" "github.com/hashicorp/vault/sdk/physical" "github.com/hashicorp/vault/sdk/plugin/pb" bolt "go.etcd.io/bbolt" ) const ( deleteOp uint32 = 1 << iota putOp restoreCallbackOp getOp chunkingPrefix = "raftchunking/" databaseFilename = "vault.db" ) var ( // dataBucketName is the value we use for the bucket dataBucketName = []byte("data") configBucketName = []byte("config") latestIndexKey = []byte("latest_indexes") latestConfigKey = []byte("latest_config") localNodeConfigKey = []byte("local_node_config") ) // Verify FSM satisfies the correct interfaces var ( _ physical.Backend = (*FSM)(nil) _ physical.Transactional = (*FSM)(nil) _ raft.FSM = (*FSM)(nil) _ raft.BatchingFSM = (*FSM)(nil) ) type restoreCallback func(context.Context) error type FSMEntry struct { Key string Value []byte } func (f *FSMEntry) String() string { return fmt.Sprintf("Key: %s. Value: %s", f.Key, hex.EncodeToString(f.Value)) } // FSMApplyResponse is returned from an FSM apply. It indicates if the apply was // successful or not. EntryMap contains the keys/values from the Get operations. type FSMApplyResponse struct { Success bool EntrySlice []*FSMEntry } // FSM is Vault's primary state storage. It writes updates to a bolt db file // that lives on local disk. FSM implements raft.FSM and physical.Backend // interfaces. type FSM struct { // latestIndex and latestTerm must stay at the top of this struct to be // properly 64-bit aligned. // latestIndex and latestTerm are the term and index of the last log we // received latestIndex *uint64 latestTerm *uint64 // latestConfig is the latest server configuration we've seen latestConfig atomic.Value l sync.RWMutex path string logger log.Logger noopRestore bool // applyCallback is used to control the pace of applies in tests applyCallback func() db *bolt.DB // retoreCb is called after we've restored a snapshot restoreCb restoreCallback chunker *raftchunking.ChunkingBatchingFSM localID string desiredSuffrage string unknownOpTypes sync.Map } // NewFSM constructs a FSM using the given directory func NewFSM(path string, localID string, logger log.Logger) (*FSM, error) { // Initialize the latest term, index, and config values latestTerm := new(uint64) latestIndex := new(uint64) latestConfig := atomic.Value{} atomic.StoreUint64(latestTerm, 0) atomic.StoreUint64(latestIndex, 0) latestConfig.Store((*ConfigurationValue)(nil)) f := &FSM{ path: path, logger: logger, latestTerm: latestTerm, latestIndex: latestIndex, latestConfig: latestConfig, // Assume that the default intent is to join as as voter. This will be updated // when this node joins a cluster with a different suffrage, or during cluster // setup if this is already part of a cluster with a desired suffrage. desiredSuffrage: "voter", localID: localID, } f.chunker = raftchunking.NewChunkingBatchingFSM(f, &FSMChunkStorage{ f: f, ctx: context.Background(), }) dbPath := filepath.Join(path, databaseFilename) f.l.Lock() defer f.l.Unlock() if err := f.openDBFile(dbPath); err != nil { return nil, fmt.Errorf("failed to open bolt file: %w", err) } return f, nil } func (f *FSM) getDB() *bolt.DB { f.l.RLock() defer f.l.RUnlock() return f.db } // SetFSMDelay adds a delay to the FSM apply. This is used in tests to simulate // a slow apply. func (r *RaftBackend) SetFSMDelay(delay time.Duration) { r.SetFSMApplyCallback(func() { time.Sleep(delay) }) } func (r *RaftBackend) SetFSMApplyCallback(f func()) { r.fsm.l.Lock() r.fsm.applyCallback = f r.fsm.l.Unlock() } func (f *FSM) openDBFile(dbPath string) error { if len(dbPath) == 0 { return errors.New("can not open empty filename") } st, err := os.Stat(dbPath) switch { case err != nil && os.IsNotExist(err): case err != nil: return fmt.Errorf("error checking raft FSM db file %q: %v", dbPath, err) default: perms := st.Mode() & os.ModePerm if perms&0o077 != 0 { f.logger.Warn("raft FSM db file has wider permissions than needed", "needed", os.FileMode(0o600), "existing", perms) } } opts := boltOptions(dbPath) start := time.Now() boltDB, err := bolt.Open(dbPath, 0o600, opts) if err != nil { return err } elapsed := time.Now().Sub(start) f.logger.Debug("time to open database", "elapsed", elapsed, "path", dbPath) metrics.MeasureSince([]string{"raft_storage", "fsm", "open_db_file"}, start) err = boltDB.Update(func(tx *bolt.Tx) error { // make sure we have the necessary buckets created _, err := tx.CreateBucketIfNotExists(dataBucketName) if err != nil { return fmt.Errorf("failed to create bucket: %v", err) } b, err := tx.CreateBucketIfNotExists(configBucketName) if err != nil { return fmt.Errorf("failed to create bucket: %v", err) } // Read in our latest index and term and populate it inmemory val := b.Get(latestIndexKey) if val != nil { var latest IndexValue err := proto.Unmarshal(val, &latest) if err != nil { return err } atomic.StoreUint64(f.latestTerm, latest.Term) atomic.StoreUint64(f.latestIndex, latest.Index) } // Read in our latest config and populate it inmemory val = b.Get(latestConfigKey) if val != nil { var latest ConfigurationValue err := proto.Unmarshal(val, &latest) if err != nil { return err } f.latestConfig.Store(&latest) } return nil }) if err != nil { return err } f.db = boltDB return nil } func (f *FSM) Stats() bolt.Stats { f.l.RLock() defer f.l.RUnlock() return f.db.Stats() } func (f *FSM) Close() error { f.l.RLock() defer f.l.RUnlock() return f.db.Close() } func writeSnapshotMetaToDB(metadata *raft.SnapshotMeta, db *bolt.DB) error { latestIndex := &IndexValue{ Term: metadata.Term, Index: metadata.Index, } indexBytes, err := proto.Marshal(latestIndex) if err != nil { return err } protoConfig := raftConfigurationToProtoConfiguration(metadata.ConfigurationIndex, metadata.Configuration) configBytes, err := proto.Marshal(protoConfig) if err != nil { return err } err = db.Update(func(tx *bolt.Tx) error { b, err := tx.CreateBucketIfNotExists(configBucketName) if err != nil { return err } err = b.Put(latestConfigKey, configBytes) if err != nil { return err } err = b.Put(latestIndexKey, indexBytes) if err != nil { return err } return nil }) if err != nil { return err } return nil } func (f *FSM) localNodeConfig() (*LocalNodeConfigValue, error) { var configBytes []byte if err := f.db.View(func(tx *bolt.Tx) error { value := tx.Bucket(configBucketName).Get(localNodeConfigKey) if value != nil { configBytes = make([]byte, len(value)) copy(configBytes, value) } return nil }); err != nil { return nil, err } if configBytes == nil { return nil, nil } var lnConfig LocalNodeConfigValue if configBytes != nil { err := proto.Unmarshal(configBytes, &lnConfig) if err != nil { return nil, err } f.desiredSuffrage = lnConfig.DesiredSuffrage return &lnConfig, nil } return nil, nil } func (f *FSM) DesiredSuffrage() string { f.l.RLock() defer f.l.RUnlock() return f.desiredSuffrage } func (f *FSM) upgradeLocalNodeConfig() error { f.l.Lock() defer f.l.Unlock() // Read the local node config lnConfig, err := f.localNodeConfig() if err != nil { return err } // Entry is already present. Get the suffrage value. if lnConfig != nil { f.desiredSuffrage = lnConfig.DesiredSuffrage return nil } // // This is the upgrade case where there is no entry // lnConfig = &LocalNodeConfigValue{} // Refer to the persisted latest raft config config := f.latestConfig.Load().(*ConfigurationValue) // If there is no config, then this is a fresh node coming up. This could end up // being a voter or non-voter. But by default assume that this is a voter. It // will be changed if this node joins the cluster as a non-voter. if config == nil { f.desiredSuffrage = "voter" lnConfig.DesiredSuffrage = f.desiredSuffrage return f.persistDesiredSuffrage(lnConfig) } // Get the last known suffrage of the node and assume that it is the desired // suffrage. There is no better alternative here. for _, srv := range config.Servers { if srv.Id == f.localID { switch srv.Suffrage { case int32(raft.Nonvoter): lnConfig.DesiredSuffrage = "non-voter" default: lnConfig.DesiredSuffrage = "voter" } // Bring the intent to the fsm instance. f.desiredSuffrage = lnConfig.DesiredSuffrage break } } return f.persistDesiredSuffrage(lnConfig) } // recordSuffrage is called when a node successfully joins the cluster. This // intent should land in the stored configuration. If the config isn't available // yet, we still go ahead and store the intent in the fsm. During the next // update to the configuration, this intent will be persisted. func (f *FSM) recordSuffrage(desiredSuffrage string) error { f.l.Lock() defer f.l.Unlock() if err := f.persistDesiredSuffrage(&LocalNodeConfigValue{ DesiredSuffrage: desiredSuffrage, }); err != nil { return err } f.desiredSuffrage = desiredSuffrage return nil } func (f *FSM) persistDesiredSuffrage(lnconfig *LocalNodeConfigValue) error { dsBytes, err := proto.Marshal(lnconfig) if err != nil { return err } return f.db.Update(func(tx *bolt.Tx) error { return tx.Bucket(configBucketName).Put(localNodeConfigKey, dsBytes) }) } func (f *FSM) witnessSnapshot(metadata *raft.SnapshotMeta) error { f.l.RLock() defer f.l.RUnlock() err := writeSnapshotMetaToDB(metadata, f.db) if err != nil { return err } atomic.StoreUint64(f.latestIndex, metadata.Index) atomic.StoreUint64(f.latestTerm, metadata.Term) f.latestConfig.Store(raftConfigurationToProtoConfiguration(metadata.ConfigurationIndex, metadata.Configuration)) return nil } // LatestState returns the latest index and configuration values we have seen on // this FSM. func (f *FSM) LatestState() (*IndexValue, *ConfigurationValue) { return &IndexValue{ Term: atomic.LoadUint64(f.latestTerm), Index: atomic.LoadUint64(f.latestIndex), }, f.latestConfig.Load().(*ConfigurationValue) } // Delete deletes the given key from the bolt file. func (f *FSM) Delete(ctx context.Context, path string) error { defer metrics.MeasureSince([]string{"raft_storage", "fsm", "delete"}, time.Now()) f.l.RLock() defer f.l.RUnlock() return f.db.Update(func(tx *bolt.Tx) error { return tx.Bucket(dataBucketName).Delete([]byte(path)) }) } // Delete deletes the given key from the bolt file. func (f *FSM) DeletePrefix(ctx context.Context, prefix string) error { defer metrics.MeasureSince([]string{"raft_storage", "fsm", "delete_prefix"}, time.Now()) f.l.RLock() defer f.l.RUnlock() err := f.db.Update(func(tx *bolt.Tx) error { // Assume bucket exists and has keys c := tx.Bucket(dataBucketName).Cursor() prefixBytes := []byte(prefix) for k, _ := c.Seek(prefixBytes); k != nil && bytes.HasPrefix(k, prefixBytes); k, _ = c.Next() { if err := c.Delete(); err != nil { return err } } return nil }) return err } // Get retrieves the value at the given path from the bolt file. func (f *FSM) Get(ctx context.Context, path string) (*physical.Entry, error) { // TODO: Remove this outdated metric name in an older release defer metrics.MeasureSince([]string{"raft", "get"}, time.Now()) defer metrics.MeasureSince([]string{"raft_storage", "fsm", "get"}, time.Now()) f.l.RLock() defer f.l.RUnlock() var valCopy []byte var found bool err := f.db.View(func(tx *bolt.Tx) error { value := tx.Bucket(dataBucketName).Get([]byte(path)) if value != nil { found = true valCopy = make([]byte, len(value)) copy(valCopy, value) } return nil }) if err != nil { return nil, err } if !found { return nil, nil } return &physical.Entry{ Key: path, Value: valCopy, }, nil } // Put writes the given entry to the bolt file. func (f *FSM) Put(ctx context.Context, entry *physical.Entry) error { defer metrics.MeasureSince([]string{"raft_storage", "fsm", "put"}, time.Now()) f.l.RLock() defer f.l.RUnlock() // Start a write transaction. return f.db.Update(func(tx *bolt.Tx) error { return tx.Bucket(dataBucketName).Put([]byte(entry.Key), entry.Value) }) } // List retrieves the set of keys with the given prefix from the bolt file. func (f *FSM) List(ctx context.Context, prefix string) ([]string, error) { // TODO: Remove this outdated metric name in a future release defer metrics.MeasureSince([]string{"raft", "list"}, time.Now()) defer metrics.MeasureSince([]string{"raft_storage", "fsm", "list"}, time.Now()) f.l.RLock() defer f.l.RUnlock() var keys []string err := f.db.View(func(tx *bolt.Tx) error { // Assume bucket exists and has keys c := tx.Bucket(dataBucketName).Cursor() prefixBytes := []byte(prefix) for k, _ := c.Seek(prefixBytes); k != nil && bytes.HasPrefix(k, prefixBytes); k, _ = c.Next() { key := string(k) key = strings.TrimPrefix(key, prefix) if i := strings.Index(key, "/"); i == -1 { // Add objects only from the current 'folder' keys = append(keys, key) } else { // Add truncated 'folder' paths if len(keys) == 0 || keys[len(keys)-1] != key[:i+1] { keys = append(keys, string(key[:i+1])) } } } return nil }) return keys, err } // Transaction writes all the operations in the provided transaction to the bolt // file. func (f *FSM) Transaction(ctx context.Context, txns []*physical.TxnEntry) error { f.l.RLock() defer f.l.RUnlock() // Start a write transaction. err := f.db.Update(func(tx *bolt.Tx) error { b := tx.Bucket(dataBucketName) for _, txn := range txns { var err error switch txn.Operation { case physical.PutOperation: err = b.Put([]byte(txn.Entry.Key), txn.Entry.Value) case physical.DeleteOperation: err = b.Delete([]byte(txn.Entry.Key)) default: return fmt.Errorf("%q is not a supported transaction operation", txn.Operation) } if err != nil { return err } } return nil }) return err } // ApplyBatch will apply a set of logs to the FSM. This is called from the raft // library. func (f *FSM) ApplyBatch(logs []*raft.Log) []interface{} { numLogs := len(logs) if numLogs == 0 { return []interface{}{} } // We will construct one slice per log, each slice containing another slice of results from our get ops entrySlices := make([][]*FSMEntry, 0, numLogs) // Do the unmarshalling first so we don't hold locks var latestConfiguration *ConfigurationValue commands := make([]interface{}, 0, numLogs) for _, l := range logs { switch l.Type { case raft.LogCommand: command := &LogData{} err := proto.Unmarshal(l.Data, command) if err != nil { f.logger.Error("error proto unmarshaling log data", "error", err) panic("error proto unmarshaling log data") } commands = append(commands, command) case raft.LogConfiguration: configuration := raft.DecodeConfiguration(l.Data) config := raftConfigurationToProtoConfiguration(l.Index, configuration) commands = append(commands, config) // Update the latest configuration the fsm has received; we will // store this after it has been committed to storage. latestConfiguration = config default: panic(fmt.Sprintf("got unexpected log type: %d", l.Type)) } } // Only advance latest pointer if this log has a higher index value than // what we have seen in the past. var logIndex []byte var err error latestIndex, _ := f.LatestState() lastLog := logs[numLogs-1] if latestIndex.Index < lastLog.Index { logIndex, err = proto.Marshal(&IndexValue{ Term: lastLog.Term, Index: lastLog.Index, }) if err != nil { f.logger.Error("unable to marshal latest index", "error", err) panic("unable to marshal latest index") } } f.l.RLock() defer f.l.RUnlock() if f.applyCallback != nil { f.applyCallback() } err = f.db.Update(func(tx *bolt.Tx) error { b := tx.Bucket(dataBucketName) for _, commandRaw := range commands { entrySlice := make([]*FSMEntry, 0) switch command := commandRaw.(type) { case *LogData: for _, op := range command.Operations { var err error switch op.OpType { case putOp: err = b.Put([]byte(op.Key), op.Value) case deleteOp: err = b.Delete([]byte(op.Key)) case getOp: fsmEntry := &FSMEntry{ Key: op.Key, } val := b.Get([]byte(op.Key)) if len(val) > 0 { newVal := make([]byte, len(val)) copy(newVal, val) fsmEntry.Value = newVal } entrySlice = append(entrySlice, fsmEntry) case restoreCallbackOp: if f.restoreCb != nil { // Kick off the restore callback function in a go routine go f.restoreCb(context.Background()) } default: if _, ok := f.unknownOpTypes.Load(op.OpType); !ok { f.logger.Error("unsupported transaction operation", "op", op.OpType) f.unknownOpTypes.Store(op.OpType, struct{}{}) } } if err != nil { return err } } case *ConfigurationValue: b := tx.Bucket(configBucketName) configBytes, err := proto.Marshal(command) if err != nil { return err } if err := b.Put(latestConfigKey, configBytes); err != nil { return err } } entrySlices = append(entrySlices, entrySlice) } if len(logIndex) > 0 { b := tx.Bucket(configBucketName) err = b.Put(latestIndexKey, logIndex) if err != nil { return err } } return nil }) if err != nil { f.logger.Error("failed to store data", "error", err) panic("failed to store data") } // If we advanced the latest value, update the in-memory representation too. if len(logIndex) > 0 { atomic.StoreUint64(f.latestTerm, lastLog.Term) atomic.StoreUint64(f.latestIndex, lastLog.Index) } // If one or more configuration changes were processed, store the latest one. if latestConfiguration != nil { f.latestConfig.Store(latestConfiguration) } // Build the responses. The logs array is used here to ensure we reply to // all command values; even if they are not of the types we expect. This // should futureproof this function from more log types being provided. resp := make([]interface{}, numLogs) for i := range logs { resp[i] = &FSMApplyResponse{ Success: true, EntrySlice: entrySlices[i], } } return resp } // Apply will apply a log value to the FSM. This is called from the raft // library. func (f *FSM) Apply(log *raft.Log) interface{} { return f.ApplyBatch([]*raft.Log{log})[0] } type writeErrorCloser interface { io.WriteCloser CloseWithError(error) error } // writeTo will copy the FSM's content to a remote sink. The data is written // twice, once for use in determining various metadata attributes of the dataset // (size, checksum, etc) and a second for the sink of the data. We also use a // proto delimited writer so we can stream proto messages to the sink. func (f *FSM) writeTo(ctx context.Context, metaSink writeErrorCloser, sink writeErrorCloser) { defer metrics.MeasureSince([]string{"raft_storage", "fsm", "write_snapshot"}, time.Now()) protoWriter := NewDelimitedWriter(sink) metadataProtoWriter := NewDelimitedWriter(metaSink) f.l.RLock() defer f.l.RUnlock() err := f.db.View(func(tx *bolt.Tx) error { b := tx.Bucket(dataBucketName) c := b.Cursor() // Do the first scan of the data for metadata purposes. for k, v := c.First(); k != nil; k, v = c.Next() { err := metadataProtoWriter.WriteMsg(&pb.StorageEntry{ Key: string(k), Value: v, }) if err != nil { metaSink.CloseWithError(err) return err } } metaSink.Close() // Do the second scan for copy purposes. for k, v := c.First(); k != nil; k, v = c.Next() { err := protoWriter.WriteMsg(&pb.StorageEntry{ Key: string(k), Value: v, }) if err != nil { return err } } return nil }) sink.CloseWithError(err) } // Snapshot implements the FSM interface. It returns a noop snapshot object. func (f *FSM) Snapshot() (raft.FSMSnapshot, error) { return &noopSnapshotter{ fsm: f, }, nil } // SetNoopRestore is used to disable restore operations on raft startup. Because // we are using persistent storage in our FSM we do not need to issue a restore // on startup. func (f *FSM) SetNoopRestore(enabled bool) { f.l.Lock() f.noopRestore = enabled f.l.Unlock() } // Restore installs a new snapshot from the provided reader. It does an atomic // rename of the snapshot file into the database filepath. While a restore is // happening the FSM is locked and no writes or reads can be performed. func (f *FSM) Restore(r io.ReadCloser) error { defer metrics.MeasureSince([]string{"raft_storage", "fsm", "restore_snapshot"}, time.Now()) if f.noopRestore { return nil } snapshotInstaller, ok := r.(*boltSnapshotInstaller) if !ok { wrapper, ok := r.(raft.ReadCloserWrapper) if !ok { return fmt.Errorf("expected ReadCloserWrapper object, got: %T", r) } snapshotInstallerRaw := wrapper.WrappedReadCloser() snapshotInstaller, ok = snapshotInstallerRaw.(*boltSnapshotInstaller) if !ok { return fmt.Errorf("expected snapshot installer object, got: %T", snapshotInstallerRaw) } } f.l.Lock() defer f.l.Unlock() // Cache the local node config before closing the db file lnConfig, err := f.localNodeConfig() if err != nil { return err } // Close the db file if err := f.db.Close(); err != nil { f.logger.Error("failed to close database file", "error", err) return err } dbPath := filepath.Join(f.path, databaseFilename) f.logger.Info("installing snapshot to FSM") // Install the new boltdb file var retErr *multierror.Error if err := snapshotInstaller.Install(dbPath); err != nil { f.logger.Error("failed to install snapshot", "error", err) retErr = multierror.Append(retErr, fmt.Errorf("failed to install snapshot database: %w", err)) } else { f.logger.Info("snapshot installed") } // Open the db file. We want to do this regardless of if the above install // worked. If the install failed we should try to open the old DB file. if err := f.openDBFile(dbPath); err != nil { f.logger.Error("failed to open new database file", "error", err) retErr = multierror.Append(retErr, fmt.Errorf("failed to open new bolt file: %w", err)) } // Handle local node config restore. lnConfig should not be nil here, but // adding the nil check anyways for safety. if lnConfig != nil { // Persist the local node config on the restored fsm. if err := f.persistDesiredSuffrage(lnConfig); err != nil { f.logger.Error("failed to persist local node config from before the restore", "error", err) retErr = multierror.Append(retErr, fmt.Errorf("failed to persist local node config from before the restore: %w", err)) } } return retErr.ErrorOrNil() } // noopSnapshotter implements the fsm.Snapshot interface. It doesn't do anything // since our SnapshotStore reads data out of the FSM on Open(). type noopSnapshotter struct { fsm *FSM } // Persist implements the fsm.Snapshot interface. It doesn't need to persist any // state data, but it does persist the raft metadata. This is necessary so we // can be sure to capture indexes for operation types that are not sent to the // FSM. func (s *noopSnapshotter) Persist(sink raft.SnapshotSink) error { boltSnapshotSink := sink.(*BoltSnapshotSink) // We are processing a snapshot, fastforward the index, term, and // configuration to the latest seen by the raft system. if err := s.fsm.witnessSnapshot(&boltSnapshotSink.meta); err != nil { return err } return nil } // Release doesn't do anything. func (s *noopSnapshotter) Release() {} // raftConfigurationToProtoConfiguration converts a raft configuration object to // a proto value. func raftConfigurationToProtoConfiguration(index uint64, configuration raft.Configuration) *ConfigurationValue { servers := make([]*Server, len(configuration.Servers)) for i, s := range configuration.Servers { servers[i] = &Server{ Suffrage: int32(s.Suffrage), Id: string(s.ID), Address: string(s.Address), } } return &ConfigurationValue{ Index: index, Servers: servers, } } // protoConfigurationToRaftConfiguration converts a proto configuration object // to a raft object. func protoConfigurationToRaftConfiguration(configuration *ConfigurationValue) (uint64, raft.Configuration) { servers := make([]raft.Server, len(configuration.Servers)) for i, s := range configuration.Servers { servers[i] = raft.Server{ Suffrage: raft.ServerSuffrage(s.Suffrage), ID: raft.ServerID(s.Id), Address: raft.ServerAddress(s.Address), } } return configuration.Index, raft.Configuration{ Servers: servers, } } type FSMChunkStorage struct { f *FSM ctx context.Context } // chunkPaths returns a disk prefix and key given chunkinfo func (f *FSMChunkStorage) chunkPaths(chunk *raftchunking.ChunkInfo) (string, string) { prefix := fmt.Sprintf("%s%d/", chunkingPrefix, chunk.OpNum) key := fmt.Sprintf("%s%d", prefix, chunk.SequenceNum) return prefix, key } func (f *FSMChunkStorage) StoreChunk(chunk *raftchunking.ChunkInfo) (bool, error) { b, err := jsonutil.EncodeJSON(chunk) if err != nil { return false, fmt.Errorf("error encoding chunk info: %w", err) } prefix, key := f.chunkPaths(chunk) entry := &physical.Entry{ Key: key, Value: b, } f.f.l.RLock() defer f.f.l.RUnlock() // Start a write transaction. done := new(bool) if err := f.f.db.Update(func(tx *bolt.Tx) error { if err := tx.Bucket(dataBucketName).Put([]byte(entry.Key), entry.Value); err != nil { return fmt.Errorf("error storing chunk info: %w", err) } // Assume bucket exists and has keys c := tx.Bucket(dataBucketName).Cursor() var keys []string prefixBytes := []byte(prefix) for k, _ := c.Seek(prefixBytes); k != nil && bytes.HasPrefix(k, prefixBytes); k, _ = c.Next() { key := string(k) key = strings.TrimPrefix(key, prefix) if i := strings.Index(key, "/"); i == -1 { // Add objects only from the current 'folder' keys = append(keys, key) } else { // Add truncated 'folder' paths keys = strutil.AppendIfMissing(keys, string(key[:i+1])) } } *done = uint32(len(keys)) == chunk.NumChunks return nil }); err != nil { return false, err } return *done, nil } func (f *FSMChunkStorage) FinalizeOp(opNum uint64) ([]*raftchunking.ChunkInfo, error) { ret, err := f.chunksForOpNum(opNum) if err != nil { return nil, fmt.Errorf("error getting chunks for op keys: %w", err) } prefix, _ := f.chunkPaths(&raftchunking.ChunkInfo{OpNum: opNum}) if err := f.f.DeletePrefix(f.ctx, prefix); err != nil { return nil, fmt.Errorf("error deleting prefix after op finalization: %w", err) } return ret, nil } func (f *FSMChunkStorage) chunksForOpNum(opNum uint64) ([]*raftchunking.ChunkInfo, error) { prefix, _ := f.chunkPaths(&raftchunking.ChunkInfo{OpNum: opNum}) opChunkKeys, err := f.f.List(f.ctx, prefix) if err != nil { return nil, fmt.Errorf("error fetching op chunk keys: %w", err) } if len(opChunkKeys) == 0 { return nil, nil } var ret []*raftchunking.ChunkInfo for _, v := range opChunkKeys { seqNum, err := strconv.ParseInt(v, 10, 64) if err != nil { return nil, fmt.Errorf("error converting seqnum to integer: %w", err) } entry, err := f.f.Get(f.ctx, prefix+v) if err != nil { return nil, fmt.Errorf("error fetching chunkinfo: %w", err) } var ci raftchunking.ChunkInfo if err := jsonutil.DecodeJSON(entry.Value, &ci); err != nil { return nil, fmt.Errorf("error decoding chunkinfo json: %w", err) } if ret == nil { ret = make([]*raftchunking.ChunkInfo, ci.NumChunks) } ret[seqNum] = &ci } return ret, nil } func (f *FSMChunkStorage) GetChunks() (raftchunking.ChunkMap, error) { opNums, err := f.f.List(f.ctx, chunkingPrefix) if err != nil { return nil, fmt.Errorf("error doing recursive list for chunk saving: %w", err) } if len(opNums) == 0 { return nil, nil } ret := make(raftchunking.ChunkMap, len(opNums)) for _, opNumStr := range opNums { opNum, err := strconv.ParseInt(opNumStr, 10, 64) if err != nil { return nil, fmt.Errorf("error parsing op num during chunk saving: %w", err) } opChunks, err := f.chunksForOpNum(uint64(opNum)) if err != nil { return nil, fmt.Errorf("error getting chunks for op keys during chunk saving: %w", err) } ret[uint64(opNum)] = opChunks } return ret, nil } func (f *FSMChunkStorage) RestoreChunks(chunks raftchunking.ChunkMap) error { if err := f.f.DeletePrefix(f.ctx, chunkingPrefix); err != nil { return fmt.Errorf("error deleting prefix for chunk restoration: %w", err) } if len(chunks) == 0 { return nil } for opNum, opChunks := range chunks { for _, chunk := range opChunks { if chunk == nil { continue } if chunk.OpNum != opNum { return errors.New("unexpected op number in chunk") } if _, err := f.StoreChunk(chunk); err != nil { return fmt.Errorf("error storing chunk during restoration: %w", err) } } } return nil }