open-nomad/client/util.go
Mahmood Ali c5f5a1fcb9 client: defensive against getting stale alloc updates
When fetching node alloc assignments, be defensive against a stale read before
killing local nodes allocs.

The bug is when both client and servers are restarting and the client requests
the node allocation for the node, it might get stale data as server hasn't
finished applying all the restored raft transaction to store.

Consequently, client would kill and destroy the alloc locally, just to fetch it
again moments later when server store is up to date.

The bug can be reproduced quite reliably with single node setup (configured with
persistence).  I suspect it's too edge-casey to occur in production cluster with
multiple servers, but we may need to examine leader failover scenarios more closely.

In this commit, we only remove and destroy allocs if the removal index is more
recent than the alloc index. This seems like a cheap resiliency fix we already
use for detecting alloc updates.

A more proper fix would be to ensure that a nomad server only serves
RPC calls when state store is fully restored or up to date in leadership
transition cases.
2019-06-29 04:17:35 -05:00

77 lines
1.8 KiB
Go

package client
import (
"fmt"
"math/rand"
"time"
"github.com/hashicorp/nomad/nomad/structs"
)
// diffResult is used to return the sets that result from a diff
type diffResult struct {
added []*structs.Allocation
removed []string
updated []*structs.Allocation
ignore []string
}
func (d *diffResult) GoString() string {
return fmt.Sprintf("allocs: (added %d) (removed %d) (updated %d) (ignore %d)",
len(d.added), len(d.removed), len(d.updated), len(d.ignore))
}
// diffAllocs is used to diff the existing and updated allocations
// to see what has happened.
func diffAllocs(existing map[string]uint64, allocs *allocUpdates) *diffResult {
// Scan the existing allocations
result := &diffResult{}
for existID, existIndex := range existing {
// Check if the alloc was updated or filtered because an update wasn't
// needed.
alloc, pulled := allocs.pulled[existID]
_, filtered := allocs.filtered[existID]
// If not updated or filtered, removed
if !pulled && !filtered && allocs.index > existIndex {
result.removed = append(result.removed, existID)
continue
}
// Check for an update
if pulled && alloc.AllocModifyIndex > existIndex {
result.updated = append(result.updated, alloc)
continue
}
// Ignore this
result.ignore = append(result.ignore, existID)
}
// Scan the updated allocations for any that are new
for id, pulled := range allocs.pulled {
if _, ok := existing[id]; !ok {
result.added = append(result.added, pulled)
}
}
return result
}
// shuffleStrings randomly shuffles the list of strings
func shuffleStrings(list []string) {
for i := range list {
j := rand.Intn(i + 1)
list[i], list[j] = list[j], list[i]
}
}
// stoppedTimer returns a timer that's stopped and wouldn't fire until
// it's reset
func stoppedTimer() *time.Timer {
timer := time.NewTimer(0)
if !timer.Stop() {
<-timer.C
}
return timer
}