open-nomad/nomad/stats_fetcher.go

package nomad

import (
	"context"
	"sync"

	log "github.com/hashicorp/go-hclog"

	"github.com/hashicorp/consul/agent/consul/autopilot"
	"github.com/hashicorp/nomad/helper/pool"
	"github.com/hashicorp/serf/serf"
)

// StatsFetcher has two functions for autopilot. First, it lets us fetch all
// the stats in parallel so we are taking a sample as close to the same time as
// possible, since we are comparing time-sensitive info for the health check.
// Second, it bounds the time so that one slow RPC can't hold up the health
// check loop; as a side effect of how it implements this, it also limits to
// a single in-flight RPC to any given server, so goroutines don't accumulate
// as we run the health check fairly frequently.
type StatsFetcher struct {
	logger       log.Logger          // sub-logger named "stats_fetcher" by NewStatsFetcher
	pool         *pool.ConnPool      // connection pool through which the stats RPC is issued
	region       string              // region passed along with every RPC made through pool
	inflight     map[string]struct{} // IDs of servers with an outstanding RPC; guarded by inflightLock
	inflightLock sync.Mutex          // protects inflight
}

// NewStatsFetcher builds a StatsFetcher that issues its RPCs through pool for
// the given region. The supplied logger is wrapped in a sub-logger named
// "stats_fetcher", and the in-flight tracking set starts out empty.
func NewStatsFetcher(logger log.Logger, pool *pool.ConnPool, region string) *StatsFetcher {
	fetcher := new(StatsFetcher)
	fetcher.logger = logger.Named("stats_fetcher")
	fetcher.pool = pool
	fetcher.region = region
	fetcher.inflight = make(map[string]struct{})
	return fetcher
}

// fetch issues the "Status.RaftStats" RPC to a single server and, when the
// call succeeds, hands the result to replyCh. A failed call is only logged;
// nothing is sent on replyCh in that case. The RPC is deliberately not tied to
// the caller's context: we only want one in-flight RPC per server, so it is
// left to run to completion and the server's in-flight marker is removed
// afterwards regardless of the outcome.
func (f *StatsFetcher) fetch(server *serverParts, replyCh chan *autopilot.ServerStats) {
	var (
		req   struct{}
		stats autopilot.ServerStats
	)

	if rpcErr := f.pool.RPC(f.region, server.Addr, "Status.RaftStats", &req, &stats); rpcErr == nil {
		replyCh <- &stats
	} else {
		f.logger.Warn("failed retrieving server health", "server", server.Name, "error", rpcErr)
	}

	// The RPC is finished either way, so drop the in-flight marker for this
	// server.
	f.inflightLock.Lock()
	delete(f.inflight, server.ID)
	f.inflightLock.Unlock()
}

// Fetch queries, in parallel, every member that isNomadServer recognizes as a
// server and returns the stats that were collected, keyed by server ID.
//
// Servers that still have a request outstanding from an earlier call are
// skipped (and logged) so that at most one RPC is in flight per server.
// Fetch then waits for each started request until its reply arrives or ctx is
// done, whichever comes first. Servers whose reply is not available by then
// are logged and left out of the result; their RPC keeps running in the
// background. The returned map is never nil.
func (f *StatsFetcher) Fetch(ctx context.Context, members []serf.Member) map[string]*autopilot.ServerStats {
	type workItem struct {
		server  *serverParts
		replyCh chan *autopilot.ServerStats
	}

	servers := make([]*serverParts, 0, len(members))
	for _, m := range members {
		if ok, parts := isNomadServer(m); ok {
			servers = append(servers, parts)
		}
	}

	// Skip any servers that have inflight requests.
	work := make([]*workItem, 0, len(servers))
	f.inflightLock.Lock()
	for _, server := range servers {
		if _, busy := f.inflight[server.ID]; busy {
			f.logger.Warn("failed retrieving server health; last request still outstanding", "server", server.Name)
			continue
		}

		// The reply channel is buffered so the fetch goroutine can deliver
		// its result without blocking even after we have stopped waiting.
		item := &workItem{
			server:  server,
			replyCh: make(chan *autopilot.ServerStats, 1),
		}
		work = append(work, item)
		f.inflight[server.ID] = struct{}{}
		go f.fetch(item.server, item.replyCh)
	}
	f.inflightLock.Unlock()

	// Now wait for the results to come in, or for the context to be
	// canceled.
	replies := make(map[string]*autopilot.ServerStats, len(work))
	for _, item := range work {
		select {
		case reply := <-item.replyCh:
			replies[item.server.ID] = reply

		case <-ctx.Done():
			// Once ctx is done this case is always ready, and select picks
			// randomly among ready cases. Without a second look, a reply
			// that is already sitting in the channel could be thrown away
			// just because an earlier server in the list was slow. Prefer
			// the reply if one is available right now.
			select {
			case reply := <-item.replyCh:
				replies[item.server.ID] = reply
			default:
				f.logger.Warn("failed retrieving server health", "server", item.server.Name, "error", ctx.Err())
			}
		}
	}
	return replies
}
Add autopilot functionality based on Consul's autopilot 2017-12-18 21:16:23 +00:00			`package nomad`

			`import (`
			`"context"`
			`"sync"`

server 2018-09-15 23:23:13 +00:00			`log "github.com/hashicorp/go-hclog"`

Add autopilot functionality based on Consul's autopilot 2017-12-18 21:16:23 +00:00			`"github.com/hashicorp/consul/agent/consul/autopilot"`
Refactor 2018-01-12 21:58:44 +00:00			`"github.com/hashicorp/nomad/helper/pool"`
Add autopilot functionality based on Consul's autopilot 2017-12-18 21:16:23 +00:00			`"github.com/hashicorp/serf/serf"`
			`)`

			`// StatsFetcher has two functions for autopilot. First, lets us fetch all the`
			`// stats in parallel so we are taking a sample as close to the same time as`
			`// possible, since we are comparing time-sensitive info for the health check.`
			`// Second, it bounds the time so that one slow RPC can't hold up the health`
			`// check loop; as a side effect of how it implements this, it also limits to`
			`// a single in-flight RPC to any given server, so goroutines don't accumulate`
			`// as we run the health check fairly frequently.`
			`type StatsFetcher struct {`
server 2018-09-15 23:23:13 +00:00			`logger log.Logger`
Refactor 2018-01-12 21:58:44 +00:00			`pool *pool.ConnPool`
Add autopilot functionality based on Consul's autopilot 2017-12-18 21:16:23 +00:00			`region string`
			`inflight map[string]struct{}`
			`inflightLock sync.Mutex`
			`}`

			`// NewStatsFetcher returns a stats fetcher.`
server 2018-09-15 23:23:13 +00:00			`func NewStatsFetcher(logger log.Logger, pool pool.ConnPool, region string) StatsFetcher {`
Add autopilot functionality based on Consul's autopilot 2017-12-18 21:16:23 +00:00			`return &StatsFetcher{`
server 2018-09-15 23:23:13 +00:00			`logger: logger.Named("stats_fetcher"),`
Add autopilot functionality based on Consul's autopilot 2017-12-18 21:16:23 +00:00			`pool: pool,`
			`region: region,`
			`inflight: make(map[string]struct{}),`
			`}`
			`}`

			`// fetch does the RPC to fetch the server stats from a single server. We don't`
			`// cancel this when the context is canceled because we only want one in-flight`
			`// RPC to each server, so we let it finish and then clean up the in-flight`
			`// tracking.`
			`func (f StatsFetcher) fetch(server serverParts, replyCh chan *autopilot.ServerStats) {`
			`var args struct{}`
			`var reply autopilot.ServerStats`
core: remove all traces of unused protocol version Nomad inherited protocol version numbering configuration from Consul and Serf, but unlike those projects Nomad has never used it. Nomad's `protocol_version` has always been `1`. While the code is effectively unused and therefore poses no runtime risks to leave, I felt like removing it was best because: 1. Nomad's RPC subsystem has been able to evolve extensively without needing to increment the version number. 2. Nomad's HTTP API has evolved extensively without increment `API{Major,Minor}Version`. If we want to version the HTTP API in the future, I doubt this is the mechanism we would choose. 3. The presence of the `server.protocol_version` configuration parameter is confusing since `server.raft_protocol` is an important parameter for operators to consider. Even more confusing is that there is a distinct Serf protocol version which is included in `nomad server members` output under the heading `Protocol`. `raft_protocol` is the only protocol version relevant to Nomad developers and operators. The other protocol versions are either deadcode or have never changed (Serf). 4. If we were to need to version the RPC, HTTP API, or Serf protocols, I don't think these configuration parameters and variables are the best choice. If we come to that point we should choose a versioning scheme based on the use case and modern best practices -- not this 6+ year old dead code. 2021-12-01 22:36:02 +00:00			`err := f.pool.RPC(f.region, server.Addr, "Status.RaftStats", &args, &reply)`
Add autopilot functionality based on Consul's autopilot 2017-12-18 21:16:23 +00:00			`if err != nil {`
server 2018-09-15 23:23:13 +00:00			`f.logger.Warn("failed retrieving server health", "server", server.Name, "error", err)`
Add autopilot functionality based on Consul's autopilot 2017-12-18 21:16:23 +00:00			`} else {`
			`replyCh <- &reply`
			`}`

			`f.inflightLock.Lock()`
			`delete(f.inflight, server.ID)`
			`f.inflightLock.Unlock()`
			`}`

			`// Fetch will attempt to query all the servers in parallel.`
			`func (f StatsFetcher) Fetch(ctx context.Context, members []serf.Member) map[string]autopilot.ServerStats {`
			`type workItem struct {`
			`server *serverParts`
			`replyCh chan *autopilot.ServerStats`
			`}`
			`var servers []*serverParts`
			`for _, s := range members {`
			`if ok, parts := isNomadServer(s); ok {`
			`servers = append(servers, parts)`
			`}`
			`}`

			`// Skip any servers that have inflight requests.`
			`var work []*workItem`
			`f.inflightLock.Lock()`
			`for _, server := range servers {`
			`if _, ok := f.inflight[server.ID]; ok {`
server 2018-09-15 23:23:13 +00:00			`f.logger.Warn("failed retrieving server health; last request still outstanding", "server", server.Name)`
Add autopilot functionality based on Consul's autopilot 2017-12-18 21:16:23 +00:00			`} else {`
			`workItem := &workItem{`
			`server: server,`
			`replyCh: make(chan *autopilot.ServerStats, 1),`
			`}`
			`work = append(work, workItem)`
			`f.inflight[server.ID] = struct{}{}`
			`go f.fetch(workItem.server, workItem.replyCh)`
			`}`
			`}`
			`f.inflightLock.Unlock()`

			`// Now wait for the results to come in, or for the context to be`
			`// canceled.`
			`replies := make(map[string]*autopilot.ServerStats)`
			`for _, workItem := range work {`
			`select {`
			`case reply := <-workItem.replyCh:`
			`replies[workItem.server.ID] = reply`

			`case <-ctx.Done():`
server 2018-09-15 23:23:13 +00:00			`f.logger.Warn("failed retrieving server health", "server", workItem.server.Name, "error", ctx.Err())`
Add autopilot functionality based on Consul's autopilot 2017-12-18 21:16:23 +00:00			`}`
			`}`
			`return replies`
			`}`