open-consul/consul/server.go
James Phillips ffcba3df58 Merge pull request #2028 from hashicorp/f-atomic-kv
Adds support for atomic transactions spanning multiple KV entries.
2016-05-15 13:46:05 -07:00

764 lines
21 KiB
Go

package consul
import (
"crypto/tls"
"errors"
"fmt"
"log"
"net"
"net/rpc"
"os"
"path/filepath"
"reflect"
"strconv"
"strings"
"sync"
"time"
"github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/consul/agent"
"github.com/hashicorp/consul/consul/state"
"github.com/hashicorp/consul/tlsutil"
"github.com/hashicorp/raft"
"github.com/hashicorp/raft-boltdb"
"github.com/hashicorp/serf/coordinate"
"github.com/hashicorp/serf/serf"
)
// These are the protocol versions that Consul can _understand_. These are
// Consul-level protocol versions, that are used to configure the Serf
// protocol versions.
const (
ProtocolVersionMin uint8 = 1
// Version 3 added support for network coordinates but we kept the
// default protocol version at 2 to ease the transition to this new
// feature. A Consul agent speaking version 2 of the protocol will
// attempt to send its coordinates to a server who understands version
// 3 or greater.
ProtocolVersion2Compatible = 2
ProtocolVersionMax = 3
)
const (
serfLANSnapshot = "serf/local.snapshot"
serfWANSnapshot = "serf/remote.snapshot"
raftState = "raft/"
snapshotsRetained = 2
// serverRPCCache controls how long we keep an idle connection
// open to a server
serverRPCCache = 2 * time.Minute
// serverMaxStreams controls how many idle streams we keep
// open to a server
serverMaxStreams = 64
// raftLogCacheSize is the maximum number of logs to cache in-memory.
// This is used to reduce disk I/O for the recently committed entries.
raftLogCacheSize = 512
// raftRemoveGracePeriod is how long we wait to allow a RemovePeer
// to replicate to gracefully leave the cluster.
raftRemoveGracePeriod = 5 * time.Second
)
// Server is Consul server which manages the service discovery,
// health checking, DC forwarding, Raft, and multiple Serf pools.
type Server struct {
// aclAuthCache is the authoritative ACL cache
aclAuthCache *acl.Cache
// aclCache is the non-authoritative ACL cache.
aclCache *aclCache
// Consul configuration
config *Config
// Connection pool to other consul servers
connPool *ConnPool
// Endpoints holds our RPC endpoints
endpoints endpoints
// eventChLAN is used to receive events from the
// serf cluster in the datacenter
eventChLAN chan serf.Event
// eventChWAN is used to receive events from the
// serf cluster that spans datacenters
eventChWAN chan serf.Event
// fsm is the state machine used with Raft to provide
// strong consistency.
fsm *consulFSM
// Have we attempted to leave the cluster
left bool
// localConsuls is used to track the known consuls
// in the local datacenter. Used to do leader forwarding.
localConsuls map[string]*agent.Server
localLock sync.RWMutex
// Logger uses the provided LogOutput
logger *log.Logger
// The raft instance is used among Consul nodes within the
// DC to protect operations that require strong consistency
raft *raft.Raft
raftLayer *RaftLayer
raftPeers raft.PeerStore
raftStore *raftboltdb.BoltStore
raftTransport *raft.NetworkTransport
raftInmem *raft.InmemStore
// reconcileCh is used to pass events from the serf handler
// into the leader manager, so that the strong state can be
// updated
reconcileCh chan serf.Member
// remoteConsuls is used to track the known consuls in
// remote datacenters. Used to do DC forwarding.
remoteConsuls map[string][]*agent.Server
remoteLock sync.RWMutex
// rpcListener is used to listen for incoming connections
rpcListener net.Listener
rpcServer *rpc.Server
// rpcTLS is the TLS config for incoming TLS requests
rpcTLS *tls.Config
// serfLAN is the Serf cluster maintained inside the DC
// which contains all the DC nodes
serfLAN *serf.Serf
// serfWAN is the Serf cluster maintained between DC's
// which SHOULD only consist of Consul servers
serfWAN *serf.Serf
// sessionTimers track the expiration time of each Session that has
// a TTL. On expiration, a SessionDestroy event will occur, and
// destroy the session via standard session destroy processing
sessionTimers map[string]*time.Timer
sessionTimersLock sync.Mutex
// tombstoneGC is used to track the pending GC invocations
// for the KV tombstones
tombstoneGC *state.TombstoneGC
shutdown bool
shutdownCh chan struct{}
shutdownLock sync.Mutex
}
// Holds the RPC endpoints
type endpoints struct {
Catalog *Catalog
Health *Health
Status *Status
KVS *KVS
Session *Session
Internal *Internal
ACL *ACL
Coordinate *Coordinate
PreparedQuery *PreparedQuery
Txn *Txn
}
// NewServer is used to construct a new Consul server from the
// configuration, potentially returning an error
func NewServer(config *Config) (*Server, error) {
// Check the protocol version
if err := config.CheckVersion(); err != nil {
return nil, err
}
// Check for a data directory!
if config.DataDir == "" && !config.DevMode {
return nil, fmt.Errorf("Config must provide a DataDir")
}
// Sanity check the ACLs
if err := config.CheckACL(); err != nil {
return nil, err
}
// Ensure we have a log output
if config.LogOutput == nil {
config.LogOutput = os.Stderr
}
// Create the tls wrapper for outgoing connections
tlsConf := config.tlsConfig()
tlsWrap, err := tlsConf.OutgoingTLSWrapper()
if err != nil {
return nil, err
}
// Get the incoming tls config
incomingTLS, err := tlsConf.IncomingTLSConfig()
if err != nil {
return nil, err
}
// Create a logger
logger := log.New(config.LogOutput, "", log.LstdFlags)
// Create the tombstone GC
gc, err := state.NewTombstoneGC(config.TombstoneTTL, config.TombstoneTTLGranularity)
if err != nil {
return nil, err
}
// Create server
s := &Server{
config: config,
connPool: NewPool(config.LogOutput, serverRPCCache, serverMaxStreams, tlsWrap),
eventChLAN: make(chan serf.Event, 256),
eventChWAN: make(chan serf.Event, 256),
localConsuls: make(map[string]*agent.Server),
logger: logger,
reconcileCh: make(chan serf.Member, 32),
remoteConsuls: make(map[string][]*agent.Server),
rpcServer: rpc.NewServer(),
rpcTLS: incomingTLS,
tombstoneGC: gc,
shutdownCh: make(chan struct{}),
}
// Initialize the authoritative ACL cache
s.aclAuthCache, err = acl.NewCache(aclCacheSize, s.aclFault)
if err != nil {
s.Shutdown()
return nil, fmt.Errorf("Failed to create ACL cache: %v", err)
}
// Set up the non-authoritative ACL cache
if s.aclCache, err = newAclCache(config, logger, s.RPC); err != nil {
s.Shutdown()
return nil, err
}
// Initialize the RPC layer
if err := s.setupRPC(tlsWrap); err != nil {
s.Shutdown()
return nil, fmt.Errorf("Failed to start RPC layer: %v", err)
}
// Initialize the Raft server
if err := s.setupRaft(); err != nil {
s.Shutdown()
return nil, fmt.Errorf("Failed to start Raft: %v", err)
}
// Initialize the lan Serf
s.serfLAN, err = s.setupSerf(config.SerfLANConfig,
s.eventChLAN, serfLANSnapshot, false)
if err != nil {
s.Shutdown()
return nil, fmt.Errorf("Failed to start lan serf: %v", err)
}
go s.lanEventHandler()
// Initialize the wan Serf
s.serfWAN, err = s.setupSerf(config.SerfWANConfig,
s.eventChWAN, serfWANSnapshot, true)
if err != nil {
s.Shutdown()
return nil, fmt.Errorf("Failed to start wan serf: %v", err)
}
go s.wanEventHandler()
// Start listening for RPC requests
go s.listen()
// Start the metrics handlers
go s.sessionStats()
return s, nil
}
// setupSerf is used to setup and initialize a Serf
func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string, wan bool) (*serf.Serf, error) {
addr := s.rpcListener.Addr().(*net.TCPAddr)
conf.Init()
if wan {
conf.NodeName = fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Datacenter)
} else {
conf.NodeName = s.config.NodeName
}
conf.Tags["role"] = "consul"
conf.Tags["dc"] = s.config.Datacenter
conf.Tags["vsn"] = fmt.Sprintf("%d", s.config.ProtocolVersion)
conf.Tags["vsn_min"] = fmt.Sprintf("%d", ProtocolVersionMin)
conf.Tags["vsn_max"] = fmt.Sprintf("%d", ProtocolVersionMax)
conf.Tags["build"] = s.config.Build
conf.Tags["port"] = fmt.Sprintf("%d", addr.Port)
if s.config.Bootstrap {
conf.Tags["bootstrap"] = "1"
}
if s.config.BootstrapExpect != 0 {
conf.Tags["expect"] = fmt.Sprintf("%d", s.config.BootstrapExpect)
}
conf.MemberlistConfig.LogOutput = s.config.LogOutput
conf.LogOutput = s.config.LogOutput
conf.EventCh = ch
if !s.config.DevMode {
conf.SnapshotPath = filepath.Join(s.config.DataDir, path)
}
conf.ProtocolVersion = protocolVersionMap[s.config.ProtocolVersion]
conf.RejoinAfterLeave = s.config.RejoinAfterLeave
if wan {
conf.Merge = &wanMergeDelegate{}
} else {
conf.Merge = &lanMergeDelegate{dc: s.config.Datacenter}
}
// Until Consul supports this fully, we disable automatic resolution.
// When enabled, the Serf gossip may just turn off if we are the minority
// node which is rather unexpected.
conf.EnableNameConflictResolution = false
if err := ensurePath(conf.SnapshotPath, false); err != nil {
return nil, err
}
// Plumb down the enable coordinates flag.
conf.DisableCoordinates = s.config.DisableCoordinates
return serf.Create(conf)
}
// setupRaft is used to setup and initialize Raft
func (s *Server) setupRaft() error {
// If we are in bootstrap mode, enable a single node cluster
if s.config.Bootstrap || s.config.DevMode {
s.config.RaftConfig.EnableSingleNode = true
}
// Create the FSM
var err error
s.fsm, err = NewFSM(s.tombstoneGC, s.config.LogOutput)
if err != nil {
return err
}
// Create a transport layer
trans := raft.NewNetworkTransport(s.raftLayer, 3, 10*time.Second, s.config.LogOutput)
s.raftTransport = trans
var log raft.LogStore
var stable raft.StableStore
var snap raft.SnapshotStore
if s.config.DevMode {
store := raft.NewInmemStore()
s.raftInmem = store
stable = store
log = store
snap = raft.NewDiscardSnapshotStore()
s.raftPeers = &raft.StaticPeers{}
} else {
// Create the base raft path
path := filepath.Join(s.config.DataDir, raftState)
if err := ensurePath(path, true); err != nil {
return err
}
// Create the backend raft store for logs and stable storage
store, err := raftboltdb.NewBoltStore(filepath.Join(path, "raft.db"))
if err != nil {
return err
}
s.raftStore = store
stable = store
// Wrap the store in a LogCache to improve performance
cacheStore, err := raft.NewLogCache(raftLogCacheSize, store)
if err != nil {
store.Close()
return err
}
log = cacheStore
// Create the snapshot store
snapshots, err := raft.NewFileSnapshotStore(path, snapshotsRetained, s.config.LogOutput)
if err != nil {
store.Close()
return err
}
snap = snapshots
// Setup the peer store
s.raftPeers = raft.NewJSONPeers(path, trans)
}
// Ensure local host is always included if we are in bootstrap mode
if s.config.Bootstrap {
peerAddrs, err := s.raftPeers.Peers()
if err != nil {
if s.raftStore != nil {
s.raftStore.Close()
}
return err
}
if !raft.PeerContained(peerAddrs, trans.LocalAddr()) {
s.raftPeers.SetPeers(raft.AddUniquePeer(peerAddrs, trans.LocalAddr()))
}
}
// Make sure we set the LogOutput
s.config.RaftConfig.LogOutput = s.config.LogOutput
// Setup the Raft store
s.raft, err = raft.NewRaft(s.config.RaftConfig, s.fsm, log, stable,
snap, s.raftPeers, trans)
if err != nil {
if s.raftStore != nil {
s.raftStore.Close()
}
trans.Close()
return err
}
// Start monitoring leadership
go s.monitorLeadership()
return nil
}
// setupRPC is used to setup the RPC listener
func (s *Server) setupRPC(tlsWrap tlsutil.DCWrapper) error {
// Create endpoints
s.endpoints.Status = &Status{s}
s.endpoints.Catalog = &Catalog{s}
s.endpoints.Health = &Health{s}
s.endpoints.KVS = &KVS{s}
s.endpoints.Session = &Session{s}
s.endpoints.Internal = &Internal{s}
s.endpoints.ACL = &ACL{s}
s.endpoints.Coordinate = NewCoordinate(s)
s.endpoints.PreparedQuery = &PreparedQuery{s}
s.endpoints.Txn = &Txn{s}
// Register the handlers
s.rpcServer.Register(s.endpoints.Status)
s.rpcServer.Register(s.endpoints.Catalog)
s.rpcServer.Register(s.endpoints.Health)
s.rpcServer.Register(s.endpoints.KVS)
s.rpcServer.Register(s.endpoints.Session)
s.rpcServer.Register(s.endpoints.Internal)
s.rpcServer.Register(s.endpoints.ACL)
s.rpcServer.Register(s.endpoints.Coordinate)
s.rpcServer.Register(s.endpoints.PreparedQuery)
s.rpcServer.Register(s.endpoints.Txn)
list, err := net.ListenTCP("tcp", s.config.RPCAddr)
if err != nil {
return err
}
s.rpcListener = list
var advertise net.Addr
if s.config.RPCAdvertise != nil {
advertise = s.config.RPCAdvertise
} else {
advertise = s.rpcListener.Addr()
}
// Verify that we have a usable advertise address
addr, ok := advertise.(*net.TCPAddr)
if !ok {
list.Close()
return fmt.Errorf("RPC advertise address is not a TCP Address: %v", addr)
}
if addr.IP.IsUnspecified() {
list.Close()
return fmt.Errorf("RPC advertise address is not advertisable: %v", addr)
}
// Provide a DC specific wrapper. Raft replication is only
// ever done in the same datacenter, so we can provide it as a constant.
wrapper := tlsutil.SpecificDC(s.config.Datacenter, tlsWrap)
s.raftLayer = NewRaftLayer(advertise, wrapper)
return nil
}
// Shutdown is used to shutdown the server
func (s *Server) Shutdown() error {
s.logger.Printf("[INFO] consul: shutting down server")
s.shutdownLock.Lock()
defer s.shutdownLock.Unlock()
if s.shutdown {
return nil
}
s.shutdown = true
close(s.shutdownCh)
if s.serfLAN != nil {
s.serfLAN.Shutdown()
}
if s.serfWAN != nil {
s.serfWAN.Shutdown()
}
if s.raft != nil {
s.raftTransport.Close()
s.raftLayer.Close()
future := s.raft.Shutdown()
if err := future.Error(); err != nil {
s.logger.Printf("[WARN] consul: Error shutting down raft: %s", err)
}
if s.raftStore != nil {
s.raftStore.Close()
}
// Clear the peer set on a graceful leave to avoid
// triggering elections on a rejoin.
if s.left {
s.raftPeers.SetPeers(nil)
}
}
if s.rpcListener != nil {
s.rpcListener.Close()
}
// Close the connection pool
s.connPool.Shutdown()
return nil
}
// Leave is used to prepare for a graceful shutdown of the server
func (s *Server) Leave() error {
s.logger.Printf("[INFO] consul: server starting leave")
s.left = true
// Check the number of known peers
numPeers, err := s.numOtherPeers()
if err != nil {
s.logger.Printf("[ERR] consul: failed to check raft peers: %v", err)
return err
}
// If we are the current leader, and we have any other peers (cluster has multiple
// servers), we should do a RemovePeer to safely reduce the quorum size. If we are
// not the leader, then we should issue our leave intention and wait to be removed
// for some sane period of time.
isLeader := s.IsLeader()
if isLeader && numPeers > 0 {
future := s.raft.RemovePeer(s.raftTransport.LocalAddr())
if err := future.Error(); err != nil && err != raft.ErrUnknownPeer {
s.logger.Printf("[ERR] consul: failed to remove ourself as raft peer: %v", err)
}
}
// Leave the WAN pool
if s.serfWAN != nil {
if err := s.serfWAN.Leave(); err != nil {
s.logger.Printf("[ERR] consul: failed to leave WAN Serf cluster: %v", err)
}
}
// Leave the LAN pool
if s.serfLAN != nil {
if err := s.serfLAN.Leave(); err != nil {
s.logger.Printf("[ERR] consul: failed to leave LAN Serf cluster: %v", err)
}
}
// If we were not leader, wait to be safely removed from the cluster.
// We must wait to allow the raft replication to take place, otherwise
// an immediate shutdown could cause a loss of quorum.
if !isLeader {
limit := time.Now().Add(raftRemoveGracePeriod)
for numPeers > 0 && time.Now().Before(limit) {
// Update the number of peers
numPeers, err = s.numOtherPeers()
if err != nil {
s.logger.Printf("[ERR] consul: failed to check raft peers: %v", err)
break
}
// Avoid the sleep if we are done
if numPeers == 0 {
break
}
// Sleep a while and check again
time.Sleep(50 * time.Millisecond)
}
if numPeers != 0 {
s.logger.Printf("[WARN] consul: failed to leave raft peer set gracefully, timeout")
}
}
return nil
}
// numOtherPeers is used to check on the number of known peers
// excluding the local node
func (s *Server) numOtherPeers() (int, error) {
peers, err := s.raftPeers.Peers()
if err != nil {
return 0, err
}
otherPeers := raft.ExcludePeer(peers, s.raftTransport.LocalAddr())
return len(otherPeers), nil
}
// JoinLAN is used to have Consul join the inner-DC pool
// The target address should be another node inside the DC
// listening on the Serf LAN address
func (s *Server) JoinLAN(addrs []string) (int, error) {
return s.serfLAN.Join(addrs, true)
}
// JoinWAN is used to have Consul join the cross-WAN Consul ring
// The target address should be another node listening on the
// Serf WAN address
func (s *Server) JoinWAN(addrs []string) (int, error) {
return s.serfWAN.Join(addrs, true)
}
// LocalMember is used to return the local node
func (c *Server) LocalMember() serf.Member {
return c.serfLAN.LocalMember()
}
// LANMembers is used to return the members of the LAN cluster
func (s *Server) LANMembers() []serf.Member {
return s.serfLAN.Members()
}
// WANMembers is used to return the members of the LAN cluster
func (s *Server) WANMembers() []serf.Member {
return s.serfWAN.Members()
}
// RemoveFailedNode is used to remove a failed node from the cluster
func (s *Server) RemoveFailedNode(node string) error {
if err := s.serfLAN.RemoveFailedNode(node); err != nil {
return err
}
if err := s.serfWAN.RemoveFailedNode(node); err != nil {
return err
}
return nil
}
// IsLeader checks if this server is the cluster leader
func (s *Server) IsLeader() bool {
return s.raft.State() == raft.Leader
}
// KeyManagerLAN returns the LAN Serf keyring manager
func (s *Server) KeyManagerLAN() *serf.KeyManager {
return s.serfLAN.KeyManager()
}
// KeyManagerWAN returns the WAN Serf keyring manager
func (s *Server) KeyManagerWAN() *serf.KeyManager {
return s.serfWAN.KeyManager()
}
// Encrypted determines if gossip is encrypted
func (s *Server) Encrypted() bool {
return s.serfLAN.EncryptionEnabled() && s.serfWAN.EncryptionEnabled()
}
// inmemCodec is used to do an RPC call without going over a network
type inmemCodec struct {
method string
args interface{}
reply interface{}
err error
}
func (i *inmemCodec) ReadRequestHeader(req *rpc.Request) error {
req.ServiceMethod = i.method
return nil
}
func (i *inmemCodec) ReadRequestBody(args interface{}) error {
sourceValue := reflect.Indirect(reflect.Indirect(reflect.ValueOf(i.args)))
dst := reflect.Indirect(reflect.Indirect(reflect.ValueOf(args)))
dst.Set(sourceValue)
return nil
}
func (i *inmemCodec) WriteResponse(resp *rpc.Response, reply interface{}) error {
if resp.Error != "" {
i.err = errors.New(resp.Error)
return nil
}
sourceValue := reflect.Indirect(reflect.Indirect(reflect.ValueOf(reply)))
dst := reflect.Indirect(reflect.Indirect(reflect.ValueOf(i.reply)))
dst.Set(sourceValue)
return nil
}
func (i *inmemCodec) Close() error {
return nil
}
// RPC is used to make a local RPC call
func (s *Server) RPC(method string, args interface{}, reply interface{}) error {
codec := &inmemCodec{
method: method,
args: args,
reply: reply,
}
if err := s.rpcServer.ServeRequest(codec); err != nil {
return err
}
return codec.err
}
// InjectEndpoint is used to substitute an endpoint for testing.
func (s *Server) InjectEndpoint(endpoint interface{}) error {
s.logger.Printf("[WARN] consul: endpoint injected; this should only be used for testing")
return s.rpcServer.Register(endpoint)
}
// Stats is used to return statistics for debugging and insight
// for various sub-systems
func (s *Server) Stats() map[string]map[string]string {
toString := func(v uint64) string {
return strconv.FormatUint(v, 10)
}
stats := map[string]map[string]string{
"consul": map[string]string{
"server": "true",
"leader": fmt.Sprintf("%v", s.IsLeader()),
"leader_addr": s.raft.Leader(),
"bootstrap": fmt.Sprintf("%v", s.config.Bootstrap),
"known_datacenters": toString(uint64(len(s.remoteConsuls))),
},
"raft": s.raft.Stats(),
"serf_lan": s.serfLAN.Stats(),
"serf_wan": s.serfWAN.Stats(),
"runtime": runtimeStats(),
}
if peers, err := s.raftPeers.Peers(); err == nil {
stats["raft"]["raft_peers"] = strings.Join(peers, ",")
} else {
s.logger.Printf("[DEBUG] server: error getting raft peers: %v", err)
}
return stats
}
// GetLANCoordinate returns the coordinate of the server in the LAN gossip pool.
func (s *Server) GetLANCoordinate() (*coordinate.Coordinate, error) {
return s.serfLAN.GetCoordinate()
}
// GetWANCoordinate returns the coordinate of the server in the WAN gossip pool.
func (s *Server) GetWANCoordinate() (*coordinate.Coordinate, error) {
return s.serfWAN.GetCoordinate()
}