bc29610124
* Updates Raft library to get new snapshot/restore API. * Basic backup and restore working, but need some cleanup. * Breaks out a snapshot module and adds a SHA256 integrity check. * Adds snapshot ACL and fills in some missing comments. * Require a consistent read for snapshots. * Make sure snapshot works if ACLs aren't enabled. * Adds a bit of package documentation. * Returns an empty response from restore to avoid EOF errors. * Adds API client support for snapshots. * Makes internal file names match on-disk file snapshots. * Adds DC and token coverage for snapshot API test. * Adds missing documentation. * Adds a unit test for the snapshot client endpoint. * Moves the connection pool out of the client for easier testing. * Fixes an incidental issue in the prepared query unit test. I realized I had two servers in bootstrap mode so this wasn't a good setup. * Adds a half close to the TCP stream and fixes panic on error. * Adds client and endpoint tests for snapshots. * Moves the pool back into the snapshot RPC client. * Adds a TLS test and fixes half-closes for TLS connections. * Tweaks some comments. * Adds a low-level snapshot test. This is independent of Consul so we can pull this out into a library later if we want to. * Cleans up snapshot and archive and completes archive tests. * Sends a clear error for snapshot operations in dev mode. Snapshots require the Raft snapshots to be readable, which isn't supported in dev mode. Send a clear error instead of a deep-down Raft one. * Adds docs for the snapshot endpoint. * Adds a stale mode and index feedback for snapshot saves. This gives folks a way to extract data even if the cluster has no leader. * Changes the internal format of a snapshot from zip to tgz. * Pulls in Raft fix to cancel inflight before a restore. * Pulls in new Raft restore interface. * Adds metadata to snapshot saves and a verify function. * Adds basic save and restore snapshot CLI commands. * Gets rid of tarball extensions and adds restore message. * Fixes an incidental bad link in the KV docs. * Adds documentation for the snapshot CLI commands. * Scuttle any request body when a snapshot is saved. * Fixes archive unit test error message check. * Allows for nil output writers in snapshot RPC handlers. * Renames hash list Decode to DecodeAndVerify. * Closes the client connection for snapshot ops. * Lowers timeout for restore ops. * Updates Raft vendor to get new Restore signature and integrates with Consul. * Bounces the leader's internal state when we do a restore.
137 lines
3.5 KiB
Go
137 lines
3.5 KiB
Go
package raft
|
|
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
"time"
|
|
|
|
"github.com/armon/go-metrics"
|
|
)
|
|
|
|
// FSM provides an interface that can be implemented by
|
|
// clients to make use of the replicated log.
|
|
type FSM interface {
|
|
// Apply log is invoked once a log entry is committed.
|
|
// It returns a value which will be made available in the
|
|
// ApplyFuture returned by Raft.Apply method if that
|
|
// method was called on the same Raft node as the FSM.
|
|
Apply(*Log) interface{}
|
|
|
|
// Snapshot is used to support log compaction. This call should
|
|
// return an FSMSnapshot which can be used to save a point-in-time
|
|
// snapshot of the FSM. Apply and Snapshot are not called in multiple
|
|
// threads, but Apply will be called concurrently with Persist. This means
|
|
// the FSM should be implemented in a fashion that allows for concurrent
|
|
// updates while a snapshot is happening.
|
|
Snapshot() (FSMSnapshot, error)
|
|
|
|
// Restore is used to restore an FSM from a snapshot. It is not called
|
|
// concurrently with any other command. The FSM must discard all previous
|
|
// state.
|
|
Restore(io.ReadCloser) error
|
|
}
|
|
|
|
// FSMSnapshot is returned by an FSM in response to a Snapshot
|
|
// It must be safe to invoke FSMSnapshot methods with concurrent
|
|
// calls to Apply.
|
|
type FSMSnapshot interface {
|
|
// Persist should dump all necessary state to the WriteCloser 'sink',
|
|
// and call sink.Close() when finished or call sink.Cancel() on error.
|
|
Persist(sink SnapshotSink) error
|
|
|
|
// Release is invoked when we are finished with the snapshot.
|
|
Release()
|
|
}
|
|
|
|
// runFSM is a long running goroutine responsible for applying logs
|
|
// to the FSM. This is done async of other logs since we don't want
|
|
// the FSM to block our internal operations.
|
|
func (r *Raft) runFSM() {
|
|
var lastIndex, lastTerm uint64
|
|
|
|
commit := func(req *commitTuple) {
|
|
// Apply the log if a command
|
|
var resp interface{}
|
|
if req.log.Type == LogCommand {
|
|
start := time.Now()
|
|
resp = r.fsm.Apply(req.log)
|
|
metrics.MeasureSince([]string{"raft", "fsm", "apply"}, start)
|
|
}
|
|
|
|
// Update the indexes
|
|
lastIndex = req.log.Index
|
|
lastTerm = req.log.Term
|
|
|
|
// Invoke the future if given
|
|
if req.future != nil {
|
|
req.future.response = resp
|
|
req.future.respond(nil)
|
|
}
|
|
}
|
|
|
|
restore := func(req *restoreFuture) {
|
|
// Open the snapshot
|
|
meta, source, err := r.snapshots.Open(req.ID)
|
|
if err != nil {
|
|
req.respond(fmt.Errorf("failed to open snapshot %v: %v", req.ID, err))
|
|
return
|
|
}
|
|
|
|
// Attempt to restore
|
|
start := time.Now()
|
|
if err := r.fsm.Restore(source); err != nil {
|
|
req.respond(fmt.Errorf("failed to restore snapshot %v: %v", req.ID, err))
|
|
source.Close()
|
|
return
|
|
}
|
|
source.Close()
|
|
metrics.MeasureSince([]string{"raft", "fsm", "restore"}, start)
|
|
|
|
// Update the last index and term
|
|
lastIndex = meta.Index
|
|
lastTerm = meta.Term
|
|
req.respond(nil)
|
|
}
|
|
|
|
snapshot := func(req *reqSnapshotFuture) {
|
|
// Is there something to snapshot?
|
|
if lastIndex == 0 {
|
|
req.respond(ErrNothingNewToSnapshot)
|
|
return
|
|
}
|
|
|
|
// Start a snapshot
|
|
start := time.Now()
|
|
snap, err := r.fsm.Snapshot()
|
|
metrics.MeasureSince([]string{"raft", "fsm", "snapshot"}, start)
|
|
|
|
// Respond to the request
|
|
req.index = lastIndex
|
|
req.term = lastTerm
|
|
req.snapshot = snap
|
|
req.respond(err)
|
|
}
|
|
|
|
for {
|
|
select {
|
|
case ptr := <-r.fsmMutateCh:
|
|
switch req := ptr.(type) {
|
|
case *commitTuple:
|
|
commit(req)
|
|
|
|
case *restoreFuture:
|
|
restore(req)
|
|
|
|
default:
|
|
panic(fmt.Errorf("bad type passed to fsmMutateCh: %#v", ptr))
|
|
}
|
|
|
|
case req := <-r.fsmSnapshotCh:
|
|
snapshot(req)
|
|
|
|
case <-r.shutdownCh:
|
|
return
|
|
}
|
|
}
|
|
}
|