333ff22e9a
Embrace the future and use Go 1.6's vendor support via Godep. Go 1.5 users should `export GO15VENDOREXPERIMENT=1`
1888 lines
51 KiB
Go
1888 lines
51 KiB
Go
package raft
|
|
|
|
import (
|
|
"bytes"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"os"
|
|
"strconv"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/armon/go-metrics"
|
|
)
|
|
|
|
const (
	// minCheckInterval is the lower bound on how frequently the leader
	// re-checks its lease in leaderLoop, so a very small
	// LeaderLeaseTimeout cannot cause a busy wake-up loop.
	minCheckInterval = 10 * time.Millisecond
)
|
|
|
|
var (
	// Keys used to persist durable per-node state in the StableStore.
	keyCurrentTerm  = []byte("CurrentTerm")
	keyLastVoteTerm = []byte("LastVoteTerm")
	keyLastVoteCand = []byte("LastVoteCand")

	// ErrLeader is returned when an operation can't be completed on a
	// leader node.
	ErrLeader = errors.New("node is the leader")

	// ErrNotLeader is returned when an operation can't be completed on a
	// follower or candidate node.
	ErrNotLeader = errors.New("node is not the leader")

	// ErrLeadershipLost is returned when a leader fails to commit a log entry
	// because it's been deposed in the process.
	ErrLeadershipLost = errors.New("leadership lost while committing log")

	// ErrRaftShutdown is returned when operations are requested against an
	// inactive Raft.
	ErrRaftShutdown = errors.New("raft is already shutdown")

	// ErrEnqueueTimeout is returned when a command fails due to a timeout.
	ErrEnqueueTimeout = errors.New("timed out enqueuing operation")

	// ErrKnownPeer is returned when trying to add a peer to the configuration
	// that already exists.
	ErrKnownPeer = errors.New("peer already known")

	// ErrUnknownPeer is returned when trying to remove a peer from the
	// configuration that doesn't exist.
	ErrUnknownPeer = errors.New("peer is unknown")

	// ErrNothingNewToSnapshot is returned when trying to create a snapshot
	// but there's nothing new committed to the FSM since we started.
	// NOTE(review): the message is capitalized, unlike the other sentinel
	// errors; kept as-is since callers may compare the string.
	ErrNothingNewToSnapshot = errors.New("Nothing new to snapshot")
)
|
|
|
|
// commitTuple is used to send an index that was committed,
// with an optional associated future that should be invoked.
type commitTuple struct {
	// log is the committed entry to be applied by runFSM.
	log *Log

	// future, if non-nil, is responded to once the entry has been
	// applied to the FSM (nil for entries replayed without a waiter).
	future *logFuture
}
|
|
|
|
// leaderState is state that is used while we are a leader.
type leaderState struct {
	// commitCh is signaled by the inflight tracker when entries commit.
	commitCh chan struct{}

	// inflight tracks dispatched-but-uncommitted log futures.
	inflight *inflight

	// replState holds one replication routine's state per peer address.
	replState map[string]*followerReplication

	// notify is the set of pending leadership-verification futures.
	notify map[*verifyFuture]struct{}

	// stepDown is signaled when replication detects we must relinquish
	// leadership (e.g. a newer term was observed).
	stepDown chan struct{}
}
|
|
|
|
// Raft implements a Raft node.
type Raft struct {
	raftState

	// applyCh is used to async send logs to the main thread to
	// be committed and applied to the FSM.
	applyCh chan *logFuture

	// Configuration provided at Raft initialization
	conf *Config

	// FSM is the client state machine to apply commands to
	fsm FSM

	// fsmCommitCh is used to trigger async application of logs to the fsm
	fsmCommitCh chan commitTuple

	// fsmRestoreCh is used to trigger a restore from snapshot
	fsmRestoreCh chan *restoreFuture

	// fsmSnapshotCh is used to trigger a new snapshot being taken
	fsmSnapshotCh chan *reqSnapshotFuture

	// lastContact is the last time we had contact from the
	// leader node. This can be used to gauge staleness.
	// Guarded by lastContactLock.
	lastContact     time.Time
	lastContactLock sync.RWMutex

	// Leader is the current cluster leader; guarded by leaderLock.
	leader     string
	leaderLock sync.RWMutex

	// leaderCh is used to notify of leadership changes
	leaderCh chan bool

	// leaderState used only while state is leader
	leaderState leaderState

	// Stores our local addr
	localAddr string

	// Used for our logging
	logger *log.Logger

	// LogStore provides durable storage for logs
	logs LogStore

	// Track our known peers (peers excludes the local node)
	peerCh    chan *peerFuture
	peers     []string
	peerStore PeerStore

	// RPC chan comes from the transport layer
	rpcCh <-chan RPC

	// Shutdown channel to exit, protected to prevent concurrent exits
	shutdown     bool
	shutdownCh   chan struct{}
	shutdownLock sync.Mutex

	// snapshots is used to store and retrieve snapshots
	snapshots SnapshotStore

	// snapshotCh is used for user triggered snapshots
	snapshotCh chan *snapshotFuture

	// stable is a StableStore implementation for durable state
	// It provides stable storage for many fields in raftState
	stable StableStore

	// The transport layer we use
	trans Transport

	// verifyCh is used to async send verify futures to the main thread
	// to verify we are still the leader
	verifyCh chan *verifyFuture
}
|
|
|
|
// NewRaft is used to construct a new Raft node. It takes a configuration, as well
// as implementations of various interfaces that are required. If we have any old state,
// such as snapshots, logs, peers, etc, all those will be restored when creating the
// Raft node.
func NewRaft(conf *Config, fsm FSM, logs LogStore, stable StableStore, snaps SnapshotStore,
	peerStore PeerStore, trans Transport) (*Raft, error) {
	// Validate the configuration
	if err := ValidateConfig(conf); err != nil {
		return nil, err
	}

	// Ensure we have a LogOutput: prefer an explicit logger, then the
	// configured writer, then stderr.
	var logger *log.Logger
	if conf.Logger != nil {
		logger = conf.Logger
	} else {
		if conf.LogOutput == nil {
			conf.LogOutput = os.Stderr
		}
		logger = log.New(conf.LogOutput, "", log.LstdFlags)
	}

	// Try to restore the current term. A "not found" error simply means
	// this is a fresh node (term stays 0); the string match is the
	// store's convention for a missing key.
	currentTerm, err := stable.GetUint64(keyCurrentTerm)
	if err != nil && err.Error() != "not found" {
		return nil, fmt.Errorf("failed to load current term: %v", err)
	}

	// Read the last log value
	lastIdx, err := logs.LastIndex()
	if err != nil {
		return nil, fmt.Errorf("failed to find last log: %v", err)
	}

	// Get the log; index 0 means an empty log, so lastLog stays zero.
	var lastLog Log
	if lastIdx > 0 {
		if err := logs.GetLog(lastIdx, &lastLog); err != nil {
			return nil, fmt.Errorf("failed to get last log: %v", err)
		}
	}

	// Construct the list of peers that excludes us
	localAddr := trans.LocalAddr()
	peers, err := peerStore.Peers()
	if err != nil {
		return nil, fmt.Errorf("failed to get list of peers: %v", err)
	}
	peers = ExcludePeer(peers, localAddr)

	// Create Raft struct
	r := &Raft{
		applyCh:       make(chan *logFuture),
		conf:          conf,
		fsm:           fsm,
		fsmCommitCh:   make(chan commitTuple, 128),
		fsmRestoreCh:  make(chan *restoreFuture),
		fsmSnapshotCh: make(chan *reqSnapshotFuture),
		leaderCh:      make(chan bool),
		localAddr:     localAddr,
		logger:        logger,
		logs:          logs,
		peerCh:        make(chan *peerFuture),
		peers:         peers,
		peerStore:     peerStore,
		rpcCh:         trans.Consumer(),
		snapshots:     snaps,
		snapshotCh:    make(chan *snapshotFuture),
		shutdownCh:    make(chan struct{}),
		stable:        stable,
		trans:         trans,
		verifyCh:      make(chan *verifyFuture, 64),
	}

	// Initialize as a follower
	r.setState(Follower)

	// Start as leader if specified. This should only be used
	// for testing purposes.
	if conf.StartAsLeader {
		r.setState(Leader)
		r.setLeader(r.localAddr)
	}

	// Restore the current term and the last log
	r.setCurrentTerm(currentTerm)
	r.setLastLogIndex(lastLog.Index)
	r.setLastLogTerm(lastLog.Term)

	// Attempt to restore a snapshot if there are any
	if err := r.restoreSnapshot(); err != nil {
		return nil, err
	}

	// Setup a heartbeat fast-path to avoid head-of-line
	// blocking where possible. It MUST be safe for this
	// to be called concurrently with a blocking RPC.
	trans.SetHeartbeatHandler(r.processHeartbeat)

	// Start the background work
	r.goFunc(r.run)
	r.goFunc(r.runFSM)
	r.goFunc(r.runSnapshots)
	return r, nil
}
|
|
|
|
// Leader is used to return the current leader of the cluster.
|
|
// It may return empty string if there is no current leader
|
|
// or the leader is unknown.
|
|
func (r *Raft) Leader() string {
|
|
r.leaderLock.RLock()
|
|
leader := r.leader
|
|
r.leaderLock.RUnlock()
|
|
return leader
|
|
}
|
|
|
|
// setLeader is used to modify the current leader of the cluster
|
|
func (r *Raft) setLeader(leader string) {
|
|
r.leaderLock.Lock()
|
|
r.leader = leader
|
|
r.leaderLock.Unlock()
|
|
}
|
|
|
|
// Apply is used to apply a command to the FSM in a highly consistent
|
|
// manner. This returns a future that can be used to wait on the application.
|
|
// An optional timeout can be provided to limit the amount of time we wait
|
|
// for the command to be started. This must be run on the leader or it
|
|
// will fail.
|
|
func (r *Raft) Apply(cmd []byte, timeout time.Duration) ApplyFuture {
|
|
metrics.IncrCounter([]string{"raft", "apply"}, 1)
|
|
var timer <-chan time.Time
|
|
if timeout > 0 {
|
|
timer = time.After(timeout)
|
|
}
|
|
|
|
// Create a log future, no index or term yet
|
|
logFuture := &logFuture{
|
|
log: Log{
|
|
Type: LogCommand,
|
|
Data: cmd,
|
|
},
|
|
}
|
|
logFuture.init()
|
|
|
|
select {
|
|
case <-timer:
|
|
return errorFuture{ErrEnqueueTimeout}
|
|
case <-r.shutdownCh:
|
|
return errorFuture{ErrRaftShutdown}
|
|
case r.applyCh <- logFuture:
|
|
return logFuture
|
|
}
|
|
}
|
|
|
|
// Barrier is used to issue a command that blocks until all preceeding
|
|
// operations have been applied to the FSM. It can be used to ensure the
|
|
// FSM reflects all queued writes. An optional timeout can be provided to
|
|
// limit the amount of time we wait for the command to be started. This
|
|
// must be run on the leader or it will fail.
|
|
func (r *Raft) Barrier(timeout time.Duration) Future {
|
|
metrics.IncrCounter([]string{"raft", "barrier"}, 1)
|
|
var timer <-chan time.Time
|
|
if timeout > 0 {
|
|
timer = time.After(timeout)
|
|
}
|
|
|
|
// Create a log future, no index or term yet
|
|
logFuture := &logFuture{
|
|
log: Log{
|
|
Type: LogBarrier,
|
|
},
|
|
}
|
|
logFuture.init()
|
|
|
|
select {
|
|
case <-timer:
|
|
return errorFuture{ErrEnqueueTimeout}
|
|
case <-r.shutdownCh:
|
|
return errorFuture{ErrRaftShutdown}
|
|
case r.applyCh <- logFuture:
|
|
return logFuture
|
|
}
|
|
}
|
|
|
|
// VerifyLeader is used to ensure the current node is still
|
|
// the leader. This can be done to prevent stale reads when a
|
|
// new leader has potentially been elected.
|
|
func (r *Raft) VerifyLeader() Future {
|
|
metrics.IncrCounter([]string{"raft", "verify_leader"}, 1)
|
|
verifyFuture := &verifyFuture{}
|
|
verifyFuture.init()
|
|
select {
|
|
case <-r.shutdownCh:
|
|
return errorFuture{ErrRaftShutdown}
|
|
case r.verifyCh <- verifyFuture:
|
|
return verifyFuture
|
|
}
|
|
}
|
|
|
|
// AddPeer is used to add a new peer into the cluster. This must be
|
|
// run on the leader or it will fail.
|
|
func (r *Raft) AddPeer(peer string) Future {
|
|
logFuture := &logFuture{
|
|
log: Log{
|
|
Type: LogAddPeer,
|
|
peer: peer,
|
|
},
|
|
}
|
|
logFuture.init()
|
|
select {
|
|
case r.applyCh <- logFuture:
|
|
return logFuture
|
|
case <-r.shutdownCh:
|
|
return errorFuture{ErrRaftShutdown}
|
|
}
|
|
}
|
|
|
|
// RemovePeer is used to remove a peer from the cluster. If the
|
|
// current leader is being removed, it will cause a new election
|
|
// to occur. This must be run on the leader or it will fail.
|
|
func (r *Raft) RemovePeer(peer string) Future {
|
|
logFuture := &logFuture{
|
|
log: Log{
|
|
Type: LogRemovePeer,
|
|
peer: peer,
|
|
},
|
|
}
|
|
logFuture.init()
|
|
select {
|
|
case r.applyCh <- logFuture:
|
|
return logFuture
|
|
case <-r.shutdownCh:
|
|
return errorFuture{ErrRaftShutdown}
|
|
}
|
|
}
|
|
|
|
// SetPeers is used to forcibly replace the set of internal peers and
|
|
// the peerstore with the ones specified. This can be considered unsafe.
|
|
func (r *Raft) SetPeers(p []string) Future {
|
|
peerFuture := &peerFuture{
|
|
peers: p,
|
|
}
|
|
peerFuture.init()
|
|
|
|
select {
|
|
case r.peerCh <- peerFuture:
|
|
return peerFuture
|
|
case <-r.shutdownCh:
|
|
return errorFuture{ErrRaftShutdown}
|
|
}
|
|
}
|
|
|
|
// Shutdown is used to stop the Raft background routines.
|
|
// This is not a graceful operation. Provides a future that
|
|
// can be used to block until all background routines have exited.
|
|
func (r *Raft) Shutdown() Future {
|
|
r.shutdownLock.Lock()
|
|
defer r.shutdownLock.Unlock()
|
|
|
|
if !r.shutdown {
|
|
close(r.shutdownCh)
|
|
r.shutdown = true
|
|
r.setState(Shutdown)
|
|
}
|
|
|
|
return &shutdownFuture{r}
|
|
}
|
|
|
|
// Snapshot is used to manually force Raft to take a snapshot.
|
|
// Returns a future that can be used to block until complete.
|
|
func (r *Raft) Snapshot() Future {
|
|
snapFuture := &snapshotFuture{}
|
|
snapFuture.init()
|
|
select {
|
|
case r.snapshotCh <- snapFuture:
|
|
return snapFuture
|
|
case <-r.shutdownCh:
|
|
return errorFuture{ErrRaftShutdown}
|
|
}
|
|
|
|
}
|
|
|
|
// State is used to return the current raft state.
|
|
func (r *Raft) State() RaftState {
|
|
return r.getState()
|
|
}
|
|
|
|
// LeaderCh is used to get a channel which delivers signals on
|
|
// acquiring or losing leadership. It sends true if we become
|
|
// the leader, and false if we lose it. The channel is not buffered,
|
|
// and does not block on writes.
|
|
func (r *Raft) LeaderCh() <-chan bool {
|
|
return r.leaderCh
|
|
}
|
|
|
|
func (r *Raft) String() string {
|
|
return fmt.Sprintf("Node at %s [%v]", r.localAddr, r.getState())
|
|
}
|
|
|
|
// LastContact returns the time of last contact by a leader.
|
|
// This only makes sense if we are currently a follower.
|
|
func (r *Raft) LastContact() time.Time {
|
|
r.lastContactLock.RLock()
|
|
last := r.lastContact
|
|
r.lastContactLock.RUnlock()
|
|
return last
|
|
}
|
|
|
|
// Stats is used to return a map of various internal stats. This should only
|
|
// be used for informative purposes or debugging.
|
|
func (r *Raft) Stats() map[string]string {
|
|
toString := func(v uint64) string {
|
|
return strconv.FormatUint(v, 10)
|
|
}
|
|
s := map[string]string{
|
|
"state": r.getState().String(),
|
|
"term": toString(r.getCurrentTerm()),
|
|
"last_log_index": toString(r.getLastLogIndex()),
|
|
"last_log_term": toString(r.getLastLogTerm()),
|
|
"commit_index": toString(r.getCommitIndex()),
|
|
"applied_index": toString(r.getLastApplied()),
|
|
"fsm_pending": toString(uint64(len(r.fsmCommitCh))),
|
|
"last_snapshot_index": toString(r.getLastSnapshotIndex()),
|
|
"last_snapshot_term": toString(r.getLastSnapshotTerm()),
|
|
"num_peers": toString(uint64(len(r.peers))),
|
|
}
|
|
last := r.LastContact()
|
|
if last.IsZero() {
|
|
s["last_contact"] = "never"
|
|
} else if r.getState() == Leader {
|
|
s["last_contact"] = "0"
|
|
} else {
|
|
s["last_contact"] = fmt.Sprintf("%v", time.Now().Sub(last))
|
|
}
|
|
return s
|
|
}
|
|
|
|
// LastIndex returns the last index in stable storage,
|
|
// either from the last log or from the last snapshot.
|
|
func (r *Raft) LastIndex() uint64 {
|
|
return r.getLastIndex()
|
|
}
|
|
|
|
// AppliedIndex returns the last index applied to the FSM.
|
|
// This is generally lagging behind the last index, especially
|
|
// for indexes that are persisted but have not yet been considered
|
|
// committed by the leader.
|
|
func (r *Raft) AppliedIndex() uint64 {
|
|
return r.getLastApplied()
|
|
}
|
|
|
|
// runFSM is a long running goroutine responsible for applying logs
// to the FSM. This is done async of other logs since we don't want
// the FSM to block our internal operations.
func (r *Raft) runFSM() {
	// Track the index/term of the last entry applied (or restored),
	// used to stamp snapshots taken from this goroutine.
	var lastIndex, lastTerm uint64
	for {
		select {
		case req := <-r.fsmRestoreCh:
			// Open the snapshot
			meta, source, err := r.snapshots.Open(req.ID)
			if err != nil {
				req.respond(fmt.Errorf("failed to open snapshot %v: %v", req.ID, err))
				continue
			}

			// Attempt to restore; close the source in both paths.
			start := time.Now()
			if err := r.fsm.Restore(source); err != nil {
				req.respond(fmt.Errorf("failed to restore snapshot %v: %v", req.ID, err))
				source.Close()
				continue
			}
			source.Close()
			metrics.MeasureSince([]string{"raft", "fsm", "restore"}, start)

			// Update the last index and term to the snapshot's position.
			lastIndex = meta.Index
			lastTerm = meta.Term
			req.respond(nil)

		case req := <-r.fsmSnapshotCh:
			// Is there something to snapshot? Zero means nothing has
			// been applied since this goroutine started.
			if lastIndex == 0 {
				req.respond(ErrNothingNewToSnapshot)
				continue
			}

			// Get our peers so the snapshot records the peer set.
			peers, err := r.peerStore.Peers()
			if err != nil {
				req.respond(err)
				continue
			}

			// Start a snapshot
			start := time.Now()
			snap, err := r.fsm.Snapshot()
			metrics.MeasureSince([]string{"raft", "fsm", "snapshot"}, start)

			// Respond to the request; on error the snapshot fields are
			// still populated but err is surfaced to the caller.
			req.index = lastIndex
			req.term = lastTerm
			req.peers = peers
			req.snapshot = snap
			req.respond(err)

		case commitTuple := <-r.fsmCommitCh:
			// Apply the log if a command; other types (barrier, peer
			// changes) are no-ops for the FSM but still advance indexes.
			var resp interface{}
			if commitTuple.log.Type == LogCommand {
				start := time.Now()
				resp = r.fsm.Apply(commitTuple.log)
				metrics.MeasureSince([]string{"raft", "fsm", "apply"}, start)
			}

			// Update the indexes
			lastIndex = commitTuple.log.Index
			lastTerm = commitTuple.log.Term

			// Invoke the future if given
			if commitTuple.future != nil {
				commitTuple.future.response = resp
				commitTuple.future.respond(nil)
			}
		case <-r.shutdownCh:
			return
		}
	}
}
|
|
|
|
// run is a long running goroutine that runs the Raft FSM.
// It dispatches into the per-state loop (follower/candidate/leader),
// which returns whenever the state changes.
func (r *Raft) run() {
	for {
		// Check if we are doing a shutdown (non-blocking poll so a
		// state change cannot delay exit).
		select {
		case <-r.shutdownCh:
			// Clear the leader to prevent forwarding
			r.setLeader("")
			return
		default:
		}

		// Enter into a sub-FSM; each call blocks until the state changes.
		// Shutdown falls through to the loop's poll above.
		switch r.getState() {
		case Follower:
			r.runFollower()
		case Candidate:
			r.runCandidate()
		case Leader:
			r.runLeader()
		}
	}
}
|
|
|
|
// runFollower runs the FSM for a follower.
|
|
func (r *Raft) runFollower() {
|
|
didWarn := false
|
|
r.logger.Printf("[INFO] raft: %v entering Follower state", r)
|
|
metrics.IncrCounter([]string{"raft", "state", "follower"}, 1)
|
|
heartbeatTimer := randomTimeout(r.conf.HeartbeatTimeout)
|
|
for {
|
|
select {
|
|
case rpc := <-r.rpcCh:
|
|
r.processRPC(rpc)
|
|
|
|
case a := <-r.applyCh:
|
|
// Reject any operations since we are not the leader
|
|
a.respond(ErrNotLeader)
|
|
|
|
case v := <-r.verifyCh:
|
|
// Reject any operations since we are not the leader
|
|
v.respond(ErrNotLeader)
|
|
|
|
case p := <-r.peerCh:
|
|
// Set the peers
|
|
r.peers = ExcludePeer(p.peers, r.localAddr)
|
|
p.respond(r.peerStore.SetPeers(p.peers))
|
|
|
|
case <-heartbeatTimer:
|
|
// Restart the heartbeat timer
|
|
heartbeatTimer = randomTimeout(r.conf.HeartbeatTimeout)
|
|
|
|
// Check if we have had a successful contact
|
|
lastContact := r.LastContact()
|
|
if time.Now().Sub(lastContact) < r.conf.HeartbeatTimeout {
|
|
continue
|
|
}
|
|
|
|
// Heartbeat failed! Transition to the candidate state
|
|
r.setLeader("")
|
|
if len(r.peers) == 0 && !r.conf.EnableSingleNode {
|
|
if !didWarn {
|
|
r.logger.Printf("[WARN] raft: EnableSingleNode disabled, and no known peers. Aborting election.")
|
|
didWarn = true
|
|
}
|
|
} else {
|
|
r.logger.Printf("[WARN] raft: Heartbeat timeout reached, starting election")
|
|
|
|
metrics.IncrCounter([]string{"raft", "transition", "heartbeat_timout"}, 1)
|
|
r.setState(Candidate)
|
|
return
|
|
}
|
|
|
|
case <-r.shutdownCh:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// runCandidate runs the FSM for a candidate. It starts an election,
// tallies votes until a majority is reached, and steps down if a newer
// term is observed. Returns when the state changes, on election timeout
// (the caller re-enters runCandidate for a fresh election), or shutdown.
func (r *Raft) runCandidate() {
	r.logger.Printf("[INFO] raft: %v entering Candidate state", r)
	metrics.IncrCounter([]string{"raft", "state", "candidate"}, 1)

	// Start vote for us, and set a timeout
	voteCh := r.electSelf()
	electionTimer := randomTimeout(r.conf.ElectionTimeout)

	// Tally the votes, need a simple majority
	grantedVotes := 0
	votesNeeded := r.quorumSize()
	r.logger.Printf("[DEBUG] raft: Votes needed: %d", votesNeeded)

	for r.getState() == Candidate {
		select {
		case rpc := <-r.rpcCh:
			r.processRPC(rpc)

		case vote := <-voteCh:
			// Check if the term is greater than ours, bail
			if vote.Term > r.getCurrentTerm() {
				r.logger.Printf("[DEBUG] raft: Newer term discovered, fallback to follower")
				r.setState(Follower)
				r.setCurrentTerm(vote.Term)
				return
			}

			// Check if the vote is granted
			if vote.Granted {
				grantedVotes++
				r.logger.Printf("[DEBUG] raft: Vote granted from %s. Tally: %d", vote.voter, grantedVotes)
			}

			// Check if we've become the leader
			if grantedVotes >= votesNeeded {
				r.logger.Printf("[INFO] raft: Election won. Tally: %d", grantedVotes)
				r.setState(Leader)
				r.setLeader(r.localAddr)
				return
			}

		case a := <-r.applyCh:
			// Reject any operations since we are not the leader
			a.respond(ErrNotLeader)

		case v := <-r.verifyCh:
			// Reject any operations since we are not the leader
			v.respond(ErrNotLeader)

		case p := <-r.peerCh:
			// Set the peers; a forced peer change invalidates this
			// election, so fall back to follower.
			r.peers = ExcludePeer(p.peers, r.localAddr)
			p.respond(r.peerStore.SetPeers(p.peers))
			// Become a follower again
			r.setState(Follower)
			return

		case <-electionTimer:
			// Election failed! Restart the election. We simply return,
			// which will kick us back into runCandidate
			r.logger.Printf("[WARN] raft: Election timeout reached, restarting election")
			return

		case <-r.shutdownCh:
			return
		}
	}
}
|
|
|
|
// runLeader runs the FSM for a leader. Do the setup here and drop into
// the leaderLoop for the hot loop. The deferred cleanup tears down all
// leader-only state when leadership is lost, in a specific order.
func (r *Raft) runLeader() {
	r.logger.Printf("[INFO] raft: %v entering Leader state", r)
	metrics.IncrCounter([]string{"raft", "state", "leader"}, 1)

	// Notify that we are the leader
	asyncNotifyBool(r.leaderCh, true)

	// Push to the notify channel if given
	if notify := r.conf.NotifyCh; notify != nil {
		select {
		case notify <- true:
		case <-r.shutdownCh:
		}
	}

	// Setup leader state
	r.leaderState.commitCh = make(chan struct{}, 1)
	r.leaderState.inflight = newInflight(r.leaderState.commitCh)
	r.leaderState.replState = make(map[string]*followerReplication)
	r.leaderState.notify = make(map[*verifyFuture]struct{})
	r.leaderState.stepDown = make(chan struct{}, 1)

	// Cleanup state on step down
	defer func() {
		// Since we were the leader previously, we update our
		// last contact time when we step down, so that we are not
		// reporting a last contact time from before we were the
		// leader. Otherwise, to a client it would seem our data
		// is extremely stale.
		r.setLastContact()

		// Stop replication
		for _, p := range r.leaderState.replState {
			close(p.stopCh)
		}

		// Cancel inflight requests
		r.leaderState.inflight.Cancel(ErrLeadershipLost)

		// Respond to any pending verify requests
		for future := range r.leaderState.notify {
			future.respond(ErrLeadershipLost)
		}

		// Clear all the state
		r.leaderState.commitCh = nil
		r.leaderState.inflight = nil
		r.leaderState.replState = nil
		r.leaderState.notify = nil
		r.leaderState.stepDown = nil

		// If we are stepping down for some reason, no known leader.
		// We may have stepped down due to an RPC call, which would
		// provide the leader, so we cannot always blank this out.
		r.leaderLock.Lock()
		if r.leader == r.localAddr {
			r.leader = ""
		}
		r.leaderLock.Unlock()

		// Notify that we are not the leader
		asyncNotifyBool(r.leaderCh, false)

		// Push to the notify channel if given
		if notify := r.conf.NotifyCh; notify != nil {
			select {
			case notify <- false:
			case <-r.shutdownCh:
				// On shutdown, make a best effort but do not block
				select {
				case notify <- false:
				default:
				}
			}
		}
	}()

	// Start a replication routine for each peer
	for _, peer := range r.peers {
		r.startReplication(peer)
	}

	// Dispatch a no-op log first. Instead of LogNoop,
	// we use a LogAddPeer with our peerset. This acts like
	// a no-op as well, but when doing an initial bootstrap, ensures
	// that all nodes share a common peerset.
	peerSet := append([]string{r.localAddr}, r.peers...)
	noop := &logFuture{
		log: Log{
			Type: LogAddPeer,
			Data: encodePeers(peerSet, r.trans),
		},
	}
	r.dispatchLogs([]*logFuture{noop})

	// Disable EnableSingleNode after we've been elected leader.
	// This is to prevent a split brain in the future, if we are removed
	// from the cluster and then elect ourself as leader.
	if r.conf.DisableBootstrapAfterElect && r.conf.EnableSingleNode {
		r.logger.Printf("[INFO] raft: Disabling EnableSingleNode (bootstrap)")
		r.conf.EnableSingleNode = false
	}

	// Sit in the leader loop until we step down
	r.leaderLoop()
}
|
|
|
|
// startReplication is a helper to setup state and start async replication to a peer.
|
|
func (r *Raft) startReplication(peer string) {
|
|
lastIdx := r.getLastIndex()
|
|
s := &followerReplication{
|
|
peer: peer,
|
|
inflight: r.leaderState.inflight,
|
|
stopCh: make(chan uint64, 1),
|
|
triggerCh: make(chan struct{}, 1),
|
|
currentTerm: r.getCurrentTerm(),
|
|
matchIndex: 0,
|
|
nextIndex: lastIdx + 1,
|
|
lastContact: time.Now(),
|
|
notifyCh: make(chan struct{}, 1),
|
|
stepDown: r.leaderState.stepDown,
|
|
}
|
|
r.leaderState.replState[peer] = s
|
|
r.goFunc(func() { r.replicate(s) })
|
|
asyncNotifyCh(s.triggerCh)
|
|
}
|
|
|
|
// leaderLoop is the hot loop for a leader. It is invoked
|
|
// after all the various leader setup is done.
|
|
func (r *Raft) leaderLoop() {
|
|
// stepDown is used to track if there is an inflight log that
|
|
// would cause us to lose leadership (specifically a RemovePeer of
|
|
// ourselves). If this is the case, we must not allow any logs to
|
|
// be processed in parallel, otherwise we are basing commit on
|
|
// only a single peer (ourself) and replicating to an undefined set
|
|
// of peers.
|
|
stepDown := false
|
|
|
|
lease := time.After(r.conf.LeaderLeaseTimeout)
|
|
for r.getState() == Leader {
|
|
select {
|
|
case rpc := <-r.rpcCh:
|
|
r.processRPC(rpc)
|
|
|
|
case <-r.leaderState.stepDown:
|
|
r.setState(Follower)
|
|
|
|
case <-r.leaderState.commitCh:
|
|
// Get the committed messages
|
|
committed := r.leaderState.inflight.Committed()
|
|
for e := committed.Front(); e != nil; e = e.Next() {
|
|
// Measure the commit time
|
|
commitLog := e.Value.(*logFuture)
|
|
metrics.MeasureSince([]string{"raft", "commitTime"}, commitLog.dispatch)
|
|
|
|
// Increment the commit index
|
|
idx := commitLog.log.Index
|
|
r.setCommitIndex(idx)
|
|
r.processLogs(idx, commitLog)
|
|
}
|
|
|
|
case v := <-r.verifyCh:
|
|
if v.quorumSize == 0 {
|
|
// Just dispatched, start the verification
|
|
r.verifyLeader(v)
|
|
|
|
} else if v.votes < v.quorumSize {
|
|
// Early return, means there must be a new leader
|
|
r.logger.Printf("[WARN] raft: New leader elected, stepping down")
|
|
r.setState(Follower)
|
|
delete(r.leaderState.notify, v)
|
|
v.respond(ErrNotLeader)
|
|
|
|
} else {
|
|
// Quorum of members agree, we are still leader
|
|
delete(r.leaderState.notify, v)
|
|
v.respond(nil)
|
|
}
|
|
|
|
case p := <-r.peerCh:
|
|
p.respond(ErrLeader)
|
|
|
|
case newLog := <-r.applyCh:
|
|
// Group commit, gather all the ready commits
|
|
ready := []*logFuture{newLog}
|
|
for i := 0; i < r.conf.MaxAppendEntries; i++ {
|
|
select {
|
|
case newLog := <-r.applyCh:
|
|
ready = append(ready, newLog)
|
|
default:
|
|
break
|
|
}
|
|
}
|
|
|
|
// Handle any peer set changes
|
|
n := len(ready)
|
|
for i := 0; i < n; i++ {
|
|
// Fail all future transactions once stepDown is on
|
|
if stepDown {
|
|
ready[i].respond(ErrNotLeader)
|
|
ready[i], ready[n-1] = ready[n-1], nil
|
|
n--
|
|
i--
|
|
continue
|
|
}
|
|
|
|
// Special case AddPeer and RemovePeer
|
|
log := ready[i]
|
|
if log.log.Type != LogAddPeer && log.log.Type != LogRemovePeer {
|
|
continue
|
|
}
|
|
|
|
// Check if this log should be ignored. The logs can be
|
|
// reordered here since we have not yet assigned an index
|
|
// and are not violating any promises.
|
|
if !r.preparePeerChange(log) {
|
|
ready[i], ready[n-1] = ready[n-1], nil
|
|
n--
|
|
i--
|
|
continue
|
|
}
|
|
|
|
// Apply peer set changes early and check if we will step
|
|
// down after the commit of this log. If so, we must not
|
|
// allow any future entries to make progress to avoid undefined
|
|
// behavior.
|
|
if ok := r.processLog(&log.log, nil, true); ok {
|
|
stepDown = true
|
|
}
|
|
}
|
|
|
|
// Nothing to do if all logs are invalid
|
|
if n == 0 {
|
|
continue
|
|
}
|
|
|
|
// Dispatch the logs
|
|
ready = ready[:n]
|
|
r.dispatchLogs(ready)
|
|
|
|
case <-lease:
|
|
// Check if we've exceeded the lease, potentially stepping down
|
|
maxDiff := r.checkLeaderLease()
|
|
|
|
// Next check interval should adjust for the last node we've
|
|
// contacted, without going negative
|
|
checkInterval := r.conf.LeaderLeaseTimeout - maxDiff
|
|
if checkInterval < minCheckInterval {
|
|
checkInterval = minCheckInterval
|
|
}
|
|
|
|
// Renew the lease timer
|
|
lease = time.After(checkInterval)
|
|
|
|
case <-r.shutdownCh:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// verifyLeader must be called from the main thread for safety.
// Causes the followers to attempt an immediate heartbeat.
// Results flow back through v.notifyCh (the verifyCh) and are
// tallied against v.quorumSize in leaderLoop.
func (r *Raft) verifyLeader(v *verifyFuture) {
	// Current leader always votes for self
	v.votes = 1

	// Set the quorum size, hot-path for single node: a quorum of one
	// means our own vote already settles the verification.
	v.quorumSize = r.quorumSize()
	if v.quorumSize == 1 {
		v.respond(nil)
		return
	}

	// Track this request so it can be failed on leadership loss.
	v.notifyCh = r.verifyCh
	r.leaderState.notify[v] = struct{}{}

	// Trigger immediate heartbeats; notify is guarded by notifyLock
	// since the replication goroutines read it concurrently.
	for _, repl := range r.leaderState.replState {
		repl.notifyLock.Lock()
		repl.notify = append(repl.notify, v)
		repl.notifyLock.Unlock()
		asyncNotifyCh(repl.notifyCh)
	}
}
|
|
|
|
// checkLeaderLease is used to check if we can contact a quorum of nodes
|
|
// within the last leader lease interval. If not, we need to step down,
|
|
// as we may have lost connectivity. Returns the maximum duration without
|
|
// contact.
|
|
func (r *Raft) checkLeaderLease() time.Duration {
|
|
// Track contacted nodes, we can always contact ourself
|
|
contacted := 1
|
|
|
|
// Check each follower
|
|
var maxDiff time.Duration
|
|
now := time.Now()
|
|
for peer, f := range r.leaderState.replState {
|
|
diff := now.Sub(f.LastContact())
|
|
if diff <= r.conf.LeaderLeaseTimeout {
|
|
contacted++
|
|
if diff > maxDiff {
|
|
maxDiff = diff
|
|
}
|
|
} else {
|
|
// Log at least once at high value, then debug. Otherwise it gets very verbose.
|
|
if diff <= 3*r.conf.LeaderLeaseTimeout {
|
|
r.logger.Printf("[WARN] raft: Failed to contact %v in %v", peer, diff)
|
|
} else {
|
|
r.logger.Printf("[DEBUG] raft: Failed to contact %v in %v", peer, diff)
|
|
}
|
|
}
|
|
metrics.AddSample([]string{"raft", "leader", "lastContact"}, float32(diff/time.Millisecond))
|
|
}
|
|
|
|
// Verify we can contact a quorum
|
|
quorum := r.quorumSize()
|
|
if contacted < quorum {
|
|
r.logger.Printf("[WARN] raft: Failed to contact quorum of nodes, stepping down")
|
|
r.setState(Follower)
|
|
metrics.IncrCounter([]string{"raft", "transition", "leader_lease_timeout"}, 1)
|
|
}
|
|
return maxDiff
|
|
}
|
|
|
|
// quorumSize is used to return the quorum size
|
|
func (r *Raft) quorumSize() int {
|
|
return ((len(r.peers) + 1) / 2) + 1
|
|
}
|
|
|
|
// preparePeerChange checks if a LogAddPeer or LogRemovePeer should be performed,
|
|
// and properly formats the data field on the log before dispatching it.
|
|
func (r *Raft) preparePeerChange(l *logFuture) bool {
|
|
// Check if this is a known peer
|
|
p := l.log.peer
|
|
knownPeer := PeerContained(r.peers, p) || r.localAddr == p
|
|
|
|
// Ignore known peers on add
|
|
if l.log.Type == LogAddPeer && knownPeer {
|
|
l.respond(ErrKnownPeer)
|
|
return false
|
|
}
|
|
|
|
// Ignore unknown peers on remove
|
|
if l.log.Type == LogRemovePeer && !knownPeer {
|
|
l.respond(ErrUnknownPeer)
|
|
return false
|
|
}
|
|
|
|
// Construct the peer set
|
|
var peerSet []string
|
|
if l.log.Type == LogAddPeer {
|
|
peerSet = append([]string{p, r.localAddr}, r.peers...)
|
|
} else {
|
|
peerSet = ExcludePeer(append([]string{r.localAddr}, r.peers...), p)
|
|
}
|
|
|
|
// Setup the log
|
|
l.log.Data = encodePeers(peerSet, r.trans)
|
|
return true
|
|
}
|
|
|
|
// dispatchLogs is called to push a batch of logs to disk, mark them
// as inflight and begin replication of them.
func (r *Raft) dispatchLogs(applyLogs []*logFuture) {
	now := time.Now()
	defer metrics.MeasureSince([]string{"raft", "leader", "dispatchLog"}, now)

	term := r.getCurrentTerm()
	lastIndex := r.getLastIndex()
	logs := make([]*Log, len(applyLogs))

	// Stamp each future with its index and term, and attach the
	// commitment policy (a majority of the current cluster size).
	for idx, applyLog := range applyLogs {
		applyLog.dispatch = now
		applyLog.log.Index = lastIndex + uint64(idx) + 1
		applyLog.log.Term = term
		applyLog.policy = newMajorityQuorum(len(r.peers) + 1)
		logs[idx] = &applyLog.log
	}

	// Write the log entry locally. If the local store fails we cannot
	// safely continue leading, so fail every future and step down.
	if err := r.logs.StoreLogs(logs); err != nil {
		r.logger.Printf("[ERR] raft: Failed to commit logs: %v", err)
		for _, applyLog := range applyLogs {
			applyLog.respond(err)
		}
		r.setState(Follower)
		return
	}

	// Add this to the inflight logs, commit
	r.leaderState.inflight.StartAll(applyLogs)

	// Update the last log since it's on disk now
	r.setLastLogIndex(lastIndex + uint64(len(applyLogs)))
	r.setLastLogTerm(term)

	// Notify the replicators of the new log so they ship it to followers.
	for _, f := range r.leaderState.replState {
		asyncNotifyCh(f.triggerCh)
	}
}
|
|
|
|
// processLogs is used to process all the logs from the lastApplied
|
|
// up to the given index.
|
|
func (r *Raft) processLogs(index uint64, future *logFuture) {
|
|
// Reject logs we've applied already
|
|
lastApplied := r.getLastApplied()
|
|
if index <= lastApplied {
|
|
r.logger.Printf("[WARN] raft: Skipping application of old log: %d", index)
|
|
return
|
|
}
|
|
|
|
// Apply all the preceding logs
|
|
for idx := r.getLastApplied() + 1; idx <= index; idx++ {
|
|
// Get the log, either from the future or from our log store
|
|
if future != nil && future.log.Index == idx {
|
|
r.processLog(&future.log, future, false)
|
|
|
|
} else {
|
|
l := new(Log)
|
|
if err := r.logs.GetLog(idx, l); err != nil {
|
|
r.logger.Printf("[ERR] raft: Failed to get log at %d: %v", idx, err)
|
|
panic(err)
|
|
}
|
|
r.processLog(l, nil, false)
|
|
}
|
|
|
|
// Update the lastApplied index and term
|
|
r.setLastApplied(idx)
|
|
}
|
|
}
|
|
|
|
// processLog is invoked to process the application of a single committed log.
// Returns if this log entry would cause us to stepDown after it commits.
// When precommit is true, the entry is only being staged: peer-set side
// effects that assume commitment are skipped and the future is not
// responded to here.
func (r *Raft) processLog(l *Log, future *logFuture, precommit bool) (stepDown bool) {
	switch l.Type {
	case LogBarrier:
		// Barrier is handled by the FSM
		fallthrough

	case LogCommand:
		// Forward to the fsm handler
		select {
		case r.fsmCommitCh <- commitTuple{l, future}:
		case <-r.shutdownCh:
			if future != nil {
				future.respond(ErrRaftShutdown)
			}
		}

		// Return so that the future is only responded to
		// by the FSM handler when the application is done
		return

	case LogAddPeer:
		fallthrough
	case LogRemovePeer:
		peers := decodePeers(l.Data, r.trans)
		r.logger.Printf("[DEBUG] raft: Node %v updated peer set (%v): %v", r.localAddr, l.Type, peers)

		// If the peer set does not include us, remove all other peers
		removeSelf := !PeerContained(peers, r.localAddr) && l.Type == LogRemovePeer
		if removeSelf {
			// Mark that this operation will cause us to step down as
			// leader. This prevents the future logs from being Applied
			// from this leader.
			stepDown = true

			// We only modify the peers after the commit, otherwise we
			// would be using a quorum size of 1 for the RemovePeer operation.
			// This is used with the stepDown guard to prevent any other logs.
			if !precommit {
				r.peers = nil
				r.peerStore.SetPeers([]string{r.localAddr})
			}
		} else {
			// Adopt the new peer set, excluding ourselves from the
			// in-memory list but persisting the full set.
			r.peers = ExcludePeer(peers, r.localAddr)
			r.peerStore.SetPeers(peers)
		}

		// Handle replication if we are the leader: start a replication
		// routine for any peer we don't already track.
		if r.getState() == Leader {
			for _, p := range r.peers {
				if _, ok := r.leaderState.replState[p]; !ok {
					r.logger.Printf("[INFO] raft: Added peer %v, starting replication", p)
					r.startReplication(p)
				}
			}
		}

		// Stop replication for old nodes that are no longer in the peer set.
		if r.getState() == Leader && !precommit {
			var toDelete []string
			for _, repl := range r.leaderState.replState {
				if !PeerContained(r.peers, repl.peer) {
					r.logger.Printf("[INFO] raft: Removed peer %v, stopping replication (Index: %d)", repl.peer, l.Index)

					// Replicate up to this index and stop
					repl.stopCh <- l.Index
					close(repl.stopCh)
					toDelete = append(toDelete, repl.peer)
				}
			}
			// Deletion happens outside the range loop to avoid mutating
			// the map while iterating it.
			for _, name := range toDelete {
				delete(r.leaderState.replState, name)
			}
		}

		// Handle removing ourself once the removal has committed.
		if removeSelf && !precommit {
			if r.conf.ShutdownOnRemove {
				r.logger.Printf("[INFO] raft: Removed ourself, shutting down")
				r.Shutdown()
			} else {
				r.logger.Printf("[INFO] raft: Removed ourself, transitioning to follower")
				r.setState(Follower)
			}
		}

	case LogNoop:
		// Ignore the no-op
	default:
		r.logger.Printf("[ERR] raft: Got unrecognized log type: %#v", l)
	}

	// Invoke the future if given (commands returned earlier, so this
	// covers peer changes and no-ops).
	if future != nil && !precommit {
		future.respond(nil)
	}
	return
}
|
|
|
|
// processRPC is called to handle an incoming RPC request.
|
|
func (r *Raft) processRPC(rpc RPC) {
|
|
switch cmd := rpc.Command.(type) {
|
|
case *AppendEntriesRequest:
|
|
r.appendEntries(rpc, cmd)
|
|
case *RequestVoteRequest:
|
|
r.requestVote(rpc, cmd)
|
|
case *InstallSnapshotRequest:
|
|
r.installSnapshot(rpc, cmd)
|
|
default:
|
|
r.logger.Printf("[ERR] raft: Got unexpected command: %#v", rpc.Command)
|
|
rpc.Respond(nil, fmt.Errorf("unexpected command"))
|
|
}
|
|
}
|
|
|
|
// processHeartbeat is a special handler used just for heartbeat requests
|
|
// so that they can be fast-pathed if a transport supports it.
|
|
func (r *Raft) processHeartbeat(rpc RPC) {
|
|
defer metrics.MeasureSince([]string{"raft", "rpc", "processHeartbeat"}, time.Now())
|
|
|
|
// Check if we are shutdown, just ignore the RPC
|
|
select {
|
|
case <-r.shutdownCh:
|
|
return
|
|
default:
|
|
}
|
|
|
|
// Ensure we are only handling a heartbeat
|
|
switch cmd := rpc.Command.(type) {
|
|
case *AppendEntriesRequest:
|
|
r.appendEntries(rpc, cmd)
|
|
default:
|
|
r.logger.Printf("[ERR] raft: Expected heartbeat, got command: %#v", rpc.Command)
|
|
rpc.Respond(nil, fmt.Errorf("unexpected command"))
|
|
}
|
|
}
|
|
|
|
// appendEntries is invoked when we get an append entries RPC call.
// It handles term checks, log consistency verification, conflicting-entry
// truncation, appending new entries, and advancing the commit index.
func (r *Raft) appendEntries(rpc RPC, a *AppendEntriesRequest) {
	defer metrics.MeasureSince([]string{"raft", "rpc", "appendEntries"}, time.Now())
	// Setup a response; the deferred Respond below guarantees a reply on
	// every return path.
	resp := &AppendEntriesResponse{
		Term:           r.getCurrentTerm(),
		LastLog:        r.getLastIndex(),
		Success:        false,
		NoRetryBackoff: false,
	}
	var rpcErr error
	defer func() {
		rpc.Respond(resp, rpcErr)
	}()

	// Ignore an older term
	if a.Term < r.getCurrentTerm() {
		return
	}

	// Increase the term if we see a newer one, also transition to follower
	// if we ever get an appendEntries call
	if a.Term > r.getCurrentTerm() || r.getState() != Follower {
		// Ensure transition to follower
		r.setState(Follower)
		r.setCurrentTerm(a.Term)
		resp.Term = a.Term
	}

	// Save the current leader
	r.setLeader(r.trans.DecodePeer(a.Leader))

	// Verify the last log entry matches the leader's view of our log.
	if a.PrevLogEntry > 0 {
		lastIdx, lastTerm := r.getLastEntry()

		var prevLogTerm uint64
		if a.PrevLogEntry == lastIdx {
			// Fast path: the previous entry is our latest, term is cached.
			prevLogTerm = lastTerm

		} else {
			var prevLog Log
			if err := r.logs.GetLog(a.PrevLogEntry, &prevLog); err != nil {
				r.logger.Printf("[WARN] raft: Failed to get previous log: %d %v (last: %d)",
					a.PrevLogEntry, err, lastIdx)
				// Missing entry: tell the leader to jump back rather than
				// retrying with backoff.
				resp.NoRetryBackoff = true
				return
			}
			prevLogTerm = prevLog.Term
		}

		if a.PrevLogTerm != prevLogTerm {
			r.logger.Printf("[WARN] raft: Previous log term mis-match: ours: %d remote: %d",
				prevLogTerm, a.PrevLogTerm)
			resp.NoRetryBackoff = true
			return
		}
	}

	// Process any new entries
	if n := len(a.Entries); n > 0 {
		start := time.Now()
		first := a.Entries[0]
		last := a.Entries[n-1]

		// Delete any conflicting entries: anything at or past the first
		// new index is superseded by the leader's entries.
		lastLogIdx := r.getLastLogIndex()
		if first.Index <= lastLogIdx {
			r.logger.Printf("[WARN] raft: Clearing log suffix from %d to %d", first.Index, lastLogIdx)
			if err := r.logs.DeleteRange(first.Index, lastLogIdx); err != nil {
				r.logger.Printf("[ERR] raft: Failed to clear log suffix: %v", err)
				return
			}
		}

		// Append the entry
		if err := r.logs.StoreLogs(a.Entries); err != nil {
			r.logger.Printf("[ERR] raft: Failed to append to logs: %v", err)
			return
		}

		// Update the lastLog
		r.setLastLogIndex(last.Index)
		r.setLastLogTerm(last.Term)
		metrics.MeasureSince([]string{"raft", "rpc", "appendEntries", "storeLogs"}, start)
	}

	// Update the commit index, capped at our own last index, and apply
	// the newly committed entries.
	if a.LeaderCommitIndex > 0 && a.LeaderCommitIndex > r.getCommitIndex() {
		start := time.Now()
		idx := min(a.LeaderCommitIndex, r.getLastIndex())
		r.setCommitIndex(idx)
		r.processLogs(idx, nil)
		metrics.MeasureSince([]string{"raft", "rpc", "appendEntries", "processLogs"}, start)
	}

	// Everything went well, set success
	resp.Success = true
	r.setLastContact()
	return
}
|
|
|
|
// requestVote is invoked when we get an request vote RPC call.
|
|
func (r *Raft) requestVote(rpc RPC, req *RequestVoteRequest) {
|
|
defer metrics.MeasureSince([]string{"raft", "rpc", "requestVote"}, time.Now())
|
|
// Setup a response
|
|
resp := &RequestVoteResponse{
|
|
Term: r.getCurrentTerm(),
|
|
Peers: encodePeers(r.peers, r.trans),
|
|
Granted: false,
|
|
}
|
|
var rpcErr error
|
|
defer func() {
|
|
rpc.Respond(resp, rpcErr)
|
|
}()
|
|
|
|
// Check if we have an existing leader [who's not the candidate]
|
|
candidate := r.trans.DecodePeer(req.Candidate)
|
|
if leader := r.Leader(); leader != "" && leader != candidate {
|
|
r.logger.Printf("[WARN] raft: Rejecting vote request from %v since we have a leader: %v",
|
|
candidate, leader)
|
|
return
|
|
}
|
|
|
|
// Ignore an older term
|
|
if req.Term < r.getCurrentTerm() {
|
|
return
|
|
}
|
|
|
|
// Increase the term if we see a newer one
|
|
if req.Term > r.getCurrentTerm() {
|
|
// Ensure transition to follower
|
|
r.setState(Follower)
|
|
r.setCurrentTerm(req.Term)
|
|
resp.Term = req.Term
|
|
}
|
|
|
|
// Check if we have voted yet
|
|
lastVoteTerm, err := r.stable.GetUint64(keyLastVoteTerm)
|
|
if err != nil && err.Error() != "not found" {
|
|
r.logger.Printf("[ERR] raft: Failed to get last vote term: %v", err)
|
|
return
|
|
}
|
|
lastVoteCandBytes, err := r.stable.Get(keyLastVoteCand)
|
|
if err != nil && err.Error() != "not found" {
|
|
r.logger.Printf("[ERR] raft: Failed to get last vote candidate: %v", err)
|
|
return
|
|
}
|
|
|
|
// Check if we've voted in this election before
|
|
if lastVoteTerm == req.Term && lastVoteCandBytes != nil {
|
|
r.logger.Printf("[INFO] raft: Duplicate RequestVote for same term: %d", req.Term)
|
|
if bytes.Compare(lastVoteCandBytes, req.Candidate) == 0 {
|
|
r.logger.Printf("[WARN] raft: Duplicate RequestVote from candidate: %s", req.Candidate)
|
|
resp.Granted = true
|
|
}
|
|
return
|
|
}
|
|
|
|
// Reject if their term is older
|
|
lastIdx, lastTerm := r.getLastEntry()
|
|
if lastTerm > req.LastLogTerm {
|
|
r.logger.Printf("[WARN] raft: Rejecting vote request from %v since our last term is greater (%d, %d)",
|
|
candidate, lastTerm, req.LastLogTerm)
|
|
return
|
|
}
|
|
|
|
if lastIdx > req.LastLogIndex {
|
|
r.logger.Printf("[WARN] raft: Rejecting vote request from %v since our last index is greater (%d, %d)",
|
|
candidate, lastIdx, req.LastLogIndex)
|
|
return
|
|
}
|
|
|
|
// Persist a vote for safety
|
|
if err := r.persistVote(req.Term, req.Candidate); err != nil {
|
|
r.logger.Printf("[ERR] raft: Failed to persist vote: %v", err)
|
|
return
|
|
}
|
|
|
|
resp.Granted = true
|
|
return
|
|
}
|
|
|
|
// installSnapshot is invoked when we get a InstallSnapshot RPC call.
// We must be in the follower state for this, since it means we are
// too far behind a leader for log replay.
func (r *Raft) installSnapshot(rpc RPC, req *InstallSnapshotRequest) {
	defer metrics.MeasureSince([]string{"raft", "rpc", "installSnapshot"}, time.Now())
	// Setup a response; the deferred Respond guarantees a reply on every
	// return path.
	resp := &InstallSnapshotResponse{
		Term:    r.getCurrentTerm(),
		Success: false,
	}
	var rpcErr error
	defer func() {
		rpc.Respond(resp, rpcErr)
	}()

	// Ignore an older term
	if req.Term < r.getCurrentTerm() {
		return
	}

	// Increase the term if we see a newer one
	if req.Term > r.getCurrentTerm() {
		// Ensure transition to follower
		r.setState(Follower)
		r.setCurrentTerm(req.Term)
		resp.Term = req.Term
	}

	// Save the current leader
	r.setLeader(r.trans.DecodePeer(req.Leader))

	// Create a new local snapshot to receive the remote data into.
	sink, err := r.snapshots.Create(req.LastLogIndex, req.LastLogTerm, req.Peers)
	if err != nil {
		r.logger.Printf("[ERR] raft: Failed to create snapshot to install: %v", err)
		rpcErr = fmt.Errorf("failed to create snapshot: %v", err)
		return
	}

	// Spill the remote snapshot to disk, streaming from the RPC reader.
	n, err := io.Copy(sink, rpc.Reader)
	if err != nil {
		sink.Cancel()
		r.logger.Printf("[ERR] raft: Failed to copy snapshot: %v", err)
		rpcErr = err
		return
	}

	// Check that we received it all; a short read means a truncated
	// transfer and the partial snapshot must be discarded.
	if n != req.Size {
		sink.Cancel()
		r.logger.Printf("[ERR] raft: Failed to receive whole snapshot: %d / %d", n, req.Size)
		rpcErr = fmt.Errorf("short read")
		return
	}

	// Finalize the snapshot
	if err := sink.Close(); err != nil {
		r.logger.Printf("[ERR] raft: Failed to finalize snapshot: %v", err)
		rpcErr = err
		return
	}
	r.logger.Printf("[INFO] raft: Copied %d bytes to local snapshot", n)

	// Hand the snapshot to the FSM goroutine for restoration.
	future := &restoreFuture{ID: sink.ID()}
	future.init()
	select {
	case r.fsmRestoreCh <- future:
	case <-r.shutdownCh:
		future.respond(ErrRaftShutdown)
		return
	}

	// Wait for the restore to happen
	if err := future.Error(); err != nil {
		r.logger.Printf("[ERR] raft: Failed to restore snapshot: %v", err)
		rpcErr = err
		return
	}

	// Update the lastApplied so we don't replay old logs
	r.setLastApplied(req.LastLogIndex)

	// Update the last stable snapshot info
	r.setLastSnapshotIndex(req.LastLogIndex)
	r.setLastSnapshotTerm(req.LastLogTerm)

	// Restore the peer set: keep ourselves out of the in-memory list but
	// persist the full set.
	peers := decodePeers(req.Peers, r.trans)
	r.peers = ExcludePeer(peers, r.localAddr)
	r.peerStore.SetPeers(peers)

	// Compact logs, continue even if this fails
	if err := r.compactLogs(req.LastLogIndex); err != nil {
		r.logger.Printf("[ERR] raft: Failed to compact logs: %v", err)
	}

	r.logger.Printf("[INFO] raft: Installed remote snapshot")
	resp.Success = true
	r.setLastContact()
	return
}
|
|
|
|
// setLastContact is used to set the last contact time to now
|
|
func (r *Raft) setLastContact() {
|
|
r.lastContactLock.Lock()
|
|
r.lastContact = time.Now()
|
|
r.lastContactLock.Unlock()
|
|
}
|
|
|
|
// voteResult pairs a RequestVote response with the address of the
// peer that produced it, so vote tallying can attribute each ballot.
type voteResult struct {
	// Embedded so the Term/Granted/Peers fields are accessible
	// directly on the result.
	RequestVoteResponse
	voter string // address of the responding peer
}
|
|
|
|
// electSelf is used to send a RequestVote RPC to all peers,
// and vote for ourself. This has the side effect of incrementing
// the current term. The response channel returned is used to wait
// for all the responses (including a vote for ourself). Returns nil
// if our own vote could not be persisted.
func (r *Raft) electSelf() <-chan *voteResult {
	// Create a response channel, buffered for every peer plus ourselves
	// so responders never block on send.
	respCh := make(chan *voteResult, len(r.peers)+1)

	// Increment the term
	r.setCurrentTerm(r.getCurrentTerm() + 1)

	// Construct the request
	lastIdx, lastTerm := r.getLastEntry()
	req := &RequestVoteRequest{
		Term:         r.getCurrentTerm(),
		Candidate:    r.trans.EncodePeer(r.localAddr),
		LastLogIndex: lastIdx,
		LastLogTerm:  lastTerm,
	}

	// Construct a function to ask for a vote; each call spawns a
	// goroutine so the RPCs run concurrently.
	askPeer := func(peer string) {
		r.goFunc(func() {
			defer metrics.MeasureSince([]string{"raft", "candidate", "electSelf"}, time.Now())
			resp := &voteResult{voter: peer}
			err := r.trans.RequestVote(peer, req, &resp.RequestVoteResponse)
			if err != nil {
				r.logger.Printf("[ERR] raft: Failed to make RequestVote RPC to %v: %v", peer, err)
				// Treat a failed RPC as an ungranted vote in our own term.
				resp.Term = req.Term
				resp.Granted = false
			}

			// If we are not a peer, we could have been removed but failed
			// to receive the log message. OR it could mean an improperly configured
			// cluster. Either way, we should warn
			if err == nil {
				peerSet := decodePeers(resp.Peers, r.trans)
				if !PeerContained(peerSet, r.localAddr) {
					r.logger.Printf("[WARN] raft: Remote peer %v does not have local node %v as a peer",
						peer, r.localAddr)
				}
			}

			respCh <- resp
		})
	}

	// For each peer, request a vote
	for _, peer := range r.peers {
		askPeer(peer)
	}

	// Persist a vote for ourselves before counting it, so a crash cannot
	// lead to double-voting in this term.
	if err := r.persistVote(req.Term, req.Candidate); err != nil {
		r.logger.Printf("[ERR] raft: Failed to persist vote : %v", err)
		return nil
	}

	// Include our own vote
	respCh <- &voteResult{
		RequestVoteResponse: RequestVoteResponse{
			Term:    req.Term,
			Granted: true,
		},
		voter: r.localAddr,
	}
	return respCh
}
|
|
|
|
// persistVote is used to persist our vote for safety.
|
|
func (r *Raft) persistVote(term uint64, candidate []byte) error {
|
|
if err := r.stable.SetUint64(keyLastVoteTerm, term); err != nil {
|
|
return err
|
|
}
|
|
if err := r.stable.Set(keyLastVoteCand, candidate); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// setCurrentTerm is used to set the current term in a durable manner.
|
|
func (r *Raft) setCurrentTerm(t uint64) {
|
|
// Persist to disk first
|
|
if err := r.stable.SetUint64(keyCurrentTerm, t); err != nil {
|
|
panic(fmt.Errorf("failed to save current term: %v", err))
|
|
}
|
|
r.raftState.setCurrentTerm(t)
|
|
}
|
|
|
|
// setState is used to update the current state. Any state
// transition causes the known leader to be cleared. This means
// that leader should be set only after updating the state.
func (r *Raft) setState(state RaftState) {
	// Clear the leader first so observers never see a stale leader
	// paired with the new state.
	r.setLeader("")
	r.raftState.setState(state)
}
|
|
|
|
// runSnapshots is a long running goroutine used to manage taking
|
|
// new snapshots of the FSM. It runs in parallel to the FSM and
|
|
// main goroutines, so that snapshots do not block normal operation.
|
|
func (r *Raft) runSnapshots() {
|
|
for {
|
|
select {
|
|
case <-randomTimeout(r.conf.SnapshotInterval):
|
|
// Check if we should snapshot
|
|
if !r.shouldSnapshot() {
|
|
continue
|
|
}
|
|
|
|
// Trigger a snapshot
|
|
if err := r.takeSnapshot(); err != nil {
|
|
r.logger.Printf("[ERR] raft: Failed to take snapshot: %v", err)
|
|
}
|
|
|
|
case future := <-r.snapshotCh:
|
|
// User-triggered, run immediately
|
|
err := r.takeSnapshot()
|
|
if err != nil {
|
|
r.logger.Printf("[ERR] raft: Failed to take snapshot: %v", err)
|
|
}
|
|
future.respond(err)
|
|
|
|
case <-r.shutdownCh:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// shouldSnapshot checks if we meet the conditions to take
|
|
// a new snapshot.
|
|
func (r *Raft) shouldSnapshot() bool {
|
|
// Check the last snapshot index
|
|
lastSnap := r.getLastSnapshotIndex()
|
|
|
|
// Check the last log index
|
|
lastIdx, err := r.logs.LastIndex()
|
|
if err != nil {
|
|
r.logger.Printf("[ERR] raft: Failed to get last log index: %v", err)
|
|
return false
|
|
}
|
|
|
|
// Compare the delta to the threshold
|
|
delta := lastIdx - lastSnap
|
|
return delta >= r.conf.SnapshotThreshold
|
|
}
|
|
|
|
// takeSnapshot is used to take a new snapshot. It requests a snapshot
// from the FSM goroutine, persists it through the snapshot store, and
// then compacts the log.
func (r *Raft) takeSnapshot() error {
	defer metrics.MeasureSince([]string{"raft", "snapshot", "takeSnapshot"}, time.Now())
	// Create a snapshot request
	req := &reqSnapshotFuture{}
	req.init()

	// Wait for dispatch or shutdown
	select {
	case r.fsmSnapshotCh <- req:
	case <-r.shutdownCh:
		return ErrRaftShutdown
	}

	// Wait until we get a response
	if err := req.Error(); err != nil {
		// ErrNothingNewToSnapshot passes through unwrapped so callers can
		// detect the benign "no new data" case.
		if err != ErrNothingNewToSnapshot {
			err = fmt.Errorf("failed to start snapshot: %v", err)
		}
		return err
	}
	defer req.snapshot.Release()

	// Log that we are starting the snapshot
	r.logger.Printf("[INFO] raft: Starting snapshot up to %d", req.index)

	// Encode the peerset so it is preserved with the snapshot.
	peerSet := encodePeers(req.peers, r.trans)

	// Create a new snapshot
	start := time.Now()
	sink, err := r.snapshots.Create(req.index, req.term, peerSet)
	if err != nil {
		return fmt.Errorf("failed to create snapshot: %v", err)
	}
	metrics.MeasureSince([]string{"raft", "snapshot", "create"}, start)

	// Try to persist the snapshot; cancel the sink on failure so a
	// partial snapshot is discarded.
	start = time.Now()
	if err := req.snapshot.Persist(sink); err != nil {
		sink.Cancel()
		return fmt.Errorf("failed to persist snapshot: %v", err)
	}
	metrics.MeasureSince([]string{"raft", "snapshot", "persist"}, start)

	// Close and check for error
	if err := sink.Close(); err != nil {
		return fmt.Errorf("failed to close snapshot: %v", err)
	}

	// Update the last stable snapshot info
	r.setLastSnapshotIndex(req.index)
	r.setLastSnapshotTerm(req.term)

	// Compact the logs now that the snapshot covers them.
	if err := r.compactLogs(req.index); err != nil {
		return err
	}

	// Log completion
	r.logger.Printf("[INFO] raft: Snapshot to %d complete", req.index)
	return nil
}
|
|
|
|
// compactLogs takes the last inclusive index of a snapshot
// and trims the logs that are no longer needed.
func (r *Raft) compactLogs(snapIdx uint64) error {
	defer metrics.MeasureSince([]string{"raft", "compactLogs"}, time.Now())
	// Determine log ranges to compact
	minLog, err := r.logs.FirstIndex()
	if err != nil {
		return fmt.Errorf("failed to get first log index: %v", err)
	}

	// Check if we have enough logs to truncate; if everything fits within
	// TrailingLogs we keep the whole log.
	if r.getLastLogIndex() <= r.conf.TrailingLogs {
		return nil
	}

	// Truncate up to the end of the snapshot, or `TrailingLogs`
	// back from the head, which ever is further back. This ensures
	// at least `TrailingLogs` entries, but does not allow logs
	// after the snapshot to be removed.
	maxLog := min(snapIdx, r.getLastLogIndex()-r.conf.TrailingLogs)
	// NOTE(review): if the store was already compacted past this point,
	// maxLog can come out below minLog; what DeleteRange does with an
	// inverted range is store-dependent — confirm against the LogStore
	// implementations in use.

	// Log this
	r.logger.Printf("[INFO] raft: Compacting logs from %d to %d", minLog, maxLog)

	// Compact the logs
	if err := r.logs.DeleteRange(minLog, maxLog); err != nil {
		return fmt.Errorf("log compaction failed: %v", err)
	}
	return nil
}
|
|
|
|
// restoreSnapshot attempts to restore the latest snapshots, and fails
|
|
// if none of them can be restored. This is called at initialization time,
|
|
// and is completely unsafe to call at any other time.
|
|
func (r *Raft) restoreSnapshot() error {
|
|
snapshots, err := r.snapshots.List()
|
|
if err != nil {
|
|
r.logger.Printf("[ERR] raft: Failed to list snapshots: %v", err)
|
|
return err
|
|
}
|
|
|
|
// Try to load in order of newest to oldest
|
|
for _, snapshot := range snapshots {
|
|
_, source, err := r.snapshots.Open(snapshot.ID)
|
|
if err != nil {
|
|
r.logger.Printf("[ERR] raft: Failed to open snapshot %v: %v", snapshot.ID, err)
|
|
continue
|
|
}
|
|
defer source.Close()
|
|
|
|
if err := r.fsm.Restore(source); err != nil {
|
|
r.logger.Printf("[ERR] raft: Failed to restore snapshot %v: %v", snapshot.ID, err)
|
|
continue
|
|
}
|
|
|
|
// Log success
|
|
r.logger.Printf("[INFO] raft: Restored from snapshot %v", snapshot.ID)
|
|
|
|
// Update the lastApplied so we don't replay old logs
|
|
r.setLastApplied(snapshot.Index)
|
|
|
|
// Update the last stable snapshot info
|
|
r.setLastSnapshotIndex(snapshot.Index)
|
|
r.setLastSnapshotTerm(snapshot.Term)
|
|
|
|
// Success!
|
|
return nil
|
|
}
|
|
|
|
// If we had snapshots and failed to load them, its an error
|
|
if len(snapshots) > 0 {
|
|
return fmt.Errorf("failed to load any existing snapshots")
|
|
}
|
|
return nil
|
|
}
|