package nomad

import (
	"fmt"
	"math/rand"
	"net"
	"os"
	"path/filepath"
	"strconv"

	memdb "github.com/hashicorp/go-memdb"
	version "github.com/hashicorp/go-version"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/serf/serf"
)

// MinVersionPlanNormalization is the minimum version to support the
// normalization of Plan in SubmitPlan, and the denormalization of the raft log
// entry committed in ApplyPlanResultsRequest.
var MinVersionPlanNormalization = version.Must(version.NewVersion("0.9.2"))

// ensurePath is used to make sure a path exists
func ensurePath(path string, dir bool) error {
	if !dir {
		path = filepath.Dir(path)
	}
	return os.MkdirAll(path, 0755)
}

// serverParts is used to return the parts of a server role
type serverParts struct {
	Name         string
	ID           string
	Region       string
	Datacenter   string
	Port         int
	Bootstrap    bool
	Expect       int
	MajorVersion int
	MinorVersion int
	Build        version.Version
	RaftVersion  int
	Addr         net.Addr
	RPCAddr      net.Addr
	Status       serf.MemberStatus
	NonVoter     bool
}

func (s *serverParts) String() string {
	return fmt.Sprintf("%s (Addr: %s) (DC: %s)",
		s.Name, s.Addr, s.Datacenter)
}

func (s *serverParts) Copy() *serverParts {
	ns := new(serverParts)
	*ns = *s
	return ns
}

// isNomadServer returns whether a member is a Nomad server, along with a
// struct holding its various important components.
func isNomadServer(m serf.Member) (bool, *serverParts) {
	if m.Tags["role"] != "nomad" {
		return false, nil
	}

	id := "unknown"
	if v, ok := m.Tags["id"]; ok {
		id = v
	}
	region := m.Tags["region"]
	datacenter := m.Tags["dc"]
	_, bootstrap := m.Tags["bootstrap"]

	expect := 0
	expectStr, ok := m.Tags["expect"]
	var err error
	if ok {
		expect, err = strconv.Atoi(expectStr)
		if err != nil {
			return false, nil
		}
	}

	// If the server is missing the rpc_addr tag, default to the serf advertise addr
	rpcIP := net.ParseIP(m.Tags["rpc_addr"])
	if rpcIP == nil {
		rpcIP = m.Addr
	}

	portStr := m.Tags["port"]
	port, err := strconv.Atoi(portStr)
	if err != nil {
		return false, nil
	}

	buildVersion, err := version.NewVersion(m.Tags["build"])
	if err != nil {
		return false, nil
	}

	// The "vsn" tag was Version, which is now the MajorVersion number.
	majorVersionStr := m.Tags["vsn"]
	majorVersion, err := strconv.Atoi(majorVersionStr)
	if err != nil {
		return false, nil
	}

	// To keep some semblance of convention, "mvn" is now the "Minor
	// Version Number."
	minorVersionStr := m.Tags["mvn"]
	minorVersion, err := strconv.Atoi(minorVersionStr)
	if err != nil {
		minorVersion = 0
	}

	raftVsn := 0
	raftVsnString, ok := m.Tags["raft_vsn"]
	if ok {
		raftVsn, err = strconv.Atoi(raftVsnString)
		if err != nil {
			return false, nil
		}
	}

	// Check if the server is a non voter
	_, nonVoter := m.Tags["nonvoter"]

	addr := &net.TCPAddr{IP: m.Addr, Port: port}
	rpcAddr := &net.TCPAddr{IP: rpcIP, Port: port}
	parts := &serverParts{
		Name:         m.Name,
		ID:           id,
		Region:       region,
		Datacenter:   datacenter,
		Port:         port,
		Bootstrap:    bootstrap,
		Expect:       expect,
		Addr:         addr,
		RPCAddr:      rpcAddr,
		MajorVersion: majorVersion,
		MinorVersion: minorVersion,
		Build:        *buildVersion,
		RaftVersion:  raftVsn,
		Status:       m.Status,
		NonVoter:     nonVoter,
	}
	return true, parts
}
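
// Illustrative sketch (not part of the original file): isNomadServer consumes
// the gossip tags a Nomad server advertises via serf. The tag values below are
// hypothetical.
//
// ```
// m := serf.Member{
// 	Name: "node1.global",
// 	Addr: net.ParseIP("10.0.0.1"),
// 	Tags: map[string]string{
// 		"role":   "nomad",
// 		"region": "global",
// 		"dc":     "dc1",
// 		"port":   "4647",
// 		"build":  "0.9.2",
// 		"vsn":    "1",
// 	},
// }
// if ok, parts := isNomadServer(m); ok {
// 	fmt.Println(parts.String()) // node1.global (Addr: 10.0.0.1:4647) (DC: dc1)
// }
// ```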

// ServersMeetMinimumVersion returns whether the Nomad servers are at least on
// the given Nomad version. The checkFailedServers parameter specifies whether
// the versions of failed servers should be verified as well.
func ServersMeetMinimumVersion(members []serf.Member, minVersion *version.Version, checkFailedServers bool) bool {
	for _, member := range members {
		if valid, parts := isNomadServer(member); valid && (parts.Status == serf.StatusAlive || (checkFailedServers && parts.Status == serf.StatusFailed)) {
			// Check if the versions match - version.LessThan will return true for
			// 0.8.0-rc1 < 0.8.0, so we want to ignore the metadata
			versionsMatch := slicesMatch(minVersion.Segments(), parts.Build.Segments())
			if parts.Build.LessThan(minVersion) && !versionsMatch {
				return false
			}
		}
	}

	return true
}
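
// Illustrative sketch (not part of the original file): gating a feature on the
// cluster's minimum server version. `members` stands in for the serf member
// list a server would obtain from its gossip layer.
//
// ```
// var members []serf.Member // e.g. from the server's serf instance
// if ServersMeetMinimumVersion(members, MinVersionPlanNormalization, true) {
// 	// safe to submit normalized plans
// }
// ```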

func slicesMatch(a, b []int) bool {
	if a == nil && b == nil {
		return true
	}

	if a == nil || b == nil {
		return false
	}

	if len(a) != len(b) {
		return false
	}

	for i := range a {
		if a[i] != b[i] {
			return false
		}
	}

	return true
}

// shuffleStrings randomly shuffles the list of strings
func shuffleStrings(list []string) {
	for i := range list {
		j := rand.Intn(i + 1)
		list[i], list[j] = list[j], list[i]
	}
}
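
// Illustrative sketch (not part of the original file): shuffleStrings shuffles
// in place, so callers copy the slice first if the original order still
// matters. The server list below is hypothetical.
//
// ```
// servers := []string{"10.0.0.1:4647", "10.0.0.2:4647", "10.0.0.3:4647"}
// shuffleStrings(servers)
// // servers is now in a random order
// ```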

// partitionAll splits a slice of strings into a slice of slices of strings, each
// with a max size of `size`. All entries from the original slice are preserved.
// The last slice may be smaller than `size`. The input slice is unmodified.
func partitionAll(size int, xs []string) [][]string {
	if size < 1 {
		return [][]string{xs}
	}

	out := [][]string{}

	for i := 0; i < len(xs); i += size {
		j := i + size
		if j > len(xs) {
			j = len(xs)
		}
		out = append(out, xs[i:j])
	}

	return out
}
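
// Illustrative sketch (not part of the original file): batching a list of IDs,
// with the last partition holding the remainder. The inputs are hypothetical.
//
// ```
// batches := partitionAll(2, []string{"a", "b", "c", "d", "e"})
// // batches == [][]string{{"a", "b"}, {"c", "d"}, {"e"}}
// ```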

// maxUint64 returns the maximum of its inputs, or 0 when called with none.
func maxUint64(inputs ...uint64) uint64 {
	l := len(inputs)
	if l == 0 {
		return 0
	} else if l == 1 {
		return inputs[0]
	}

	max := inputs[0]
	for i := 1; i < l; i++ {
		cur := inputs[i]
		if cur > max {
			max = cur
		}
	}
	return max
}

// getNodeForRpc returns a Node struct if the Node supports Node RPC. Otherwise
// an error is returned.
func getNodeForRpc(snap *state.StateSnapshot, nodeID string) (*structs.Node, error) {
	node, err := snap.NodeByID(nil, nodeID)
	if err != nil {
		return nil, err
	}

	if node == nil {
		return nil, fmt.Errorf("Unknown node %q", nodeID)
	}

	if err := nodeSupportsRpc(node); err != nil {
		return nil, err
	}

	return node, nil
}

var minNodeVersionSupportingRPC = version.Must(version.NewVersion("0.8.0-rc1"))

// nodeSupportsRpc returns a non-nil error if a Node does not support RPC.
func nodeSupportsRpc(node *structs.Node) error {
	rawNodeVer, ok := node.Attributes["nomad.version"]
	if !ok {
		return structs.ErrUnknownNomadVersion
	}

	nodeVer, err := version.NewVersion(rawNodeVer)
	if err != nil {
		return structs.ErrUnknownNomadVersion
	}

	if nodeVer.LessThan(minNodeVersionSupportingRPC) {
		return structs.ErrNodeLacksRpc
	}

	return nil
}
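
// Illustrative sketch (not part of the original file): a node advertising a
// pre-0.8.0 version is rejected. The node literal below is hypothetical and
// assumes the fingerprinted version lives under the "nomad.version" attribute,
// as the function above expects.
//
// ```
// node := &structs.Node{
// 	Attributes: map[string]string{"nomad.version": "0.7.1"},
// }
// err := nodeSupportsRpc(node) // == structs.ErrNodeLacksRpc
// ```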

// AllocGetter is an interface for retrieving allocations by ID. It is
// satisfied by *state.StateStore and *state.StateSnapshot.
type AllocGetter interface {
	AllocByID(ws memdb.WatchSet, id string) (*structs.Allocation, error)
}

// getAlloc retrieves an allocation by ID. If the allocation is nil, an error
// is returned.
func getAlloc(state AllocGetter, allocID string) (*structs.Allocation, error) {
	if allocID == "" {
		return nil, structs.ErrMissingAllocID
	}

	alloc, err := state.AllocByID(nil, allocID)
	if err != nil {
		return nil, err
	}

	if alloc == nil {
		return nil, structs.NewErrUnknownAllocation(allocID)
	}

	return alloc, nil
}
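
// Illustrative sketch (not part of the original file): an RPC handler resolving
// an allocation from a state snapshot, which satisfies AllocGetter. The
// snapshot and request fields are hypothetical.
//
// ```
// alloc, err := getAlloc(snap, args.AllocID)
// if err != nil {
// 	return err // ErrMissingAllocID, an unknown-allocation error, or a state error
// }
// ```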

// dropButLastChannel returns a channel that drops all but the last value from
// sourceCh.
//
// Useful for aggressively consuming sourceCh when intermediate values aren't
// relevant.
//
// This function propagates values to the result quickly and drops intermediate
// messages on a best-effort basis. The Go scheduler may delay delivery or
// result in extra deliveries.
//
// Consider this function for example:
//
// ```
// src := make(chan bool)
// dst := dropButLastChannel(src, nil)
//
// go func() {
// 	src <- true
// 	src <- false
// }()
//
// // v can be `true` here but is very unlikely
// v := <-dst
// ```
func dropButLastChannel(sourceCh <-chan bool, shutdownCh <-chan struct{}) chan bool {
	// buffer the most recent result
	dst := make(chan bool)

	go func() {
		// last value received
		lv := false
		// ok source was closed
		ok := false
		// received message since last delivery to destination
		messageReceived := false

	DEQUE_SOURCE:
		// wait for first message
		select {
		case lv, ok = <-sourceCh:
			if !ok {
				goto SOURCE_CLOSED
			}
			messageReceived = true
			goto ENQUEUE_DST
		case <-shutdownCh:
			return
		}

	ENQUEUE_DST:
		// prioritize draining the source first; dequeue without blocking
		for {
			select {
			case lv, ok = <-sourceCh:
				if !ok {
					goto SOURCE_CLOSED
				}
				messageReceived = true
			default:
				break ENQUEUE_DST
			}
		}

		// attempt to enqueue but keep monitoring the source channel
		select {
		case lv, ok = <-sourceCh:
			if !ok {
				goto SOURCE_CLOSED
			}
			messageReceived = true
			goto ENQUEUE_DST
		case dst <- lv:
			messageReceived = false
			// enqueued value; back to dequeuing from source
			goto DEQUE_SOURCE
		case <-shutdownCh:
			return
		}

	SOURCE_CLOSED:
		if messageReceived {
			select {
			case dst <- lv:
			case <-shutdownCh:
				return
			}
		}
		close(dst)
	}()

	return dst
}
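
// Illustrative sketch (not part of the original file): interposing
// dropButLastChannel between a producer that must never block (for example a
// leadership-notification channel) and a slower consumer. Only the latest
// value matters; intermediate flaps are dropped. The channels and wiring below
// are hypothetical.
//
// ```
// notifyCh := make(chan bool, 1) // producer writes leadership transitions here
// shutdownCh := make(chan struct{})
// leaderCh := dropButLastChannel(notifyCh, shutdownCh)
//
// for isLeader := range leaderCh {
// 	// react only to the most recent leadership state
// 	_ = isLeader
// }
// ```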