open-consul/agent/consul/stats_fetcher.go

package consul

import (
	"context"
	"log"
	"sync"

	"github.com/hashicorp/consul/agent/consul/autopilot"
	"github.com/hashicorp/consul/agent/metadata"
	"github.com/hashicorp/consul/agent/pool"
	"github.com/hashicorp/serf/serf"
)

// StatsFetcher has two functions for autopilot. First, it lets us fetch all
// the stats in parallel so we are taking a sample as close to the same time as
// possible, since we are comparing time-sensitive info for the health check.
// Second, it bounds the time so that one slow RPC can't hold up the health
// check loop; as a side effect of how it implements this, it also limits to
// a single in-flight RPC to any given server, so goroutines don't accumulate
// as we run the health check fairly frequently.
//
// NOTE(review): Fetch removes a server's in-flight entry when its context is
// done, while the fetch goroutine for that server is left to finish on its
// own. The single in-flight limit therefore looks like it only holds for
// requests that complete before the context ends; confirm this is intended.
type StatsFetcher struct {
	logger       *log.Logger         // receives the [WARN] messages
	pool         *pool.ConnPool      // used to issue the Status.RaftStats RPC
	datacenter   string              // passed to pool.RPC on every fetch
	inflight     map[string]struct{} // IDs of servers with a request outstanding
	inflightLock sync.Mutex          // guards inflight
}

// NewStatsFetcher builds a StatsFetcher that logs to logger and issues its
// RPCs through pool on behalf of datacenter. The in-flight tracking set starts
// out empty.
func NewStatsFetcher(logger *log.Logger, pool *pool.ConnPool, datacenter string) *StatsFetcher {
	fetcher := new(StatsFetcher)
	fetcher.logger = logger
	fetcher.pool = pool
	fetcher.datacenter = datacenter
	fetcher.inflight = make(map[string]struct{})
	return fetcher
}

// fetch issues the Status.RaftStats RPC against one server. A successful
// reply is delivered on replyCh; a failure is only logged and nothing is sent.
// The call is deliberately not tied to any context: we only want one
// in-flight RPC per server, so the RPC is allowed to run to completion and
// the server's in-flight entry is removed afterwards in either case.
func (f *StatsFetcher) fetch(server *metadata.Server, replyCh chan *autopilot.ServerStats) {
	var (
		req   struct{}
		stats autopilot.ServerStats
	)
	if err := f.pool.RPC(f.datacenter, server.Addr, server.Version, "Status.RaftStats", server.UseTLS, &req, &stats); err == nil {
		replyCh <- &stats
	} else {
		f.logger.Printf("[WARN] consul: error getting server health from %q: %v",
			server.Name, err)
	}

	// The request is finished one way or the other, so stop tracking it.
	f.inflightLock.Lock()
	delete(f.inflight, server.ID)
	f.inflightLock.Unlock()
}

// Fetch queries every Consul server found in members in parallel and returns
// the stats that arrived, keyed by server ID. Servers that still have a
// request outstanding from an earlier call are skipped with a warning. For
// each remaining server it waits for either the reply or ctx to be done; in
// the latter case a warning is logged, the server's in-flight entry is
// removed, and the server is left out of the result.
func (f *StatsFetcher) Fetch(ctx context.Context, members []serf.Member) map[string]*autopilot.ServerStats {
	// pendingFetch pairs a server with the channel its reply arrives on.
	type pendingFetch struct {
		server  *metadata.Server
		replyCh chan *autopilot.ServerStats
	}

	// Narrow the serf members down to the Consul servers.
	var servers []*metadata.Server
	for _, member := range members {
		ok, parts := metadata.IsConsulServer(member)
		if !ok {
			continue
		}
		servers = append(servers, parts)
	}

	// Launch one request per server, except for servers whose previous
	// request has not finished yet.
	var pending []*pendingFetch
	f.inflightLock.Lock()
	for _, srv := range servers {
		if _, busy := f.inflight[srv.ID]; busy {
			f.logger.Printf("[WARN] consul: error getting server health from %q: last request still outstanding",
				srv.Name)
			continue
		}

		item := &pendingFetch{
			server: srv,
			// Buffered so the fetch goroutine can deliver its reply even
			// if nobody is receiving any more.
			replyCh: make(chan *autopilot.ServerStats, 1),
		}
		pending = append(pending, item)
		f.inflight[srv.ID] = struct{}{}
		go f.fetch(item.server, item.replyCh)
	}
	f.inflightLock.Unlock()

	// Collect the results, giving up on a server once the context is done.
	replies := make(map[string]*autopilot.ServerStats)
	for _, item := range pending {
		// A reply that is already available wins over a finished context,
		// so check for one without blocking first.
		select {
		case stats := <-item.replyCh:
			replies[item.server.ID] = stats
			continue
		default:
		}

		select {
		case stats := <-item.replyCh:
			replies[item.server.ID] = stats

		case <-ctx.Done():
			f.logger.Printf("[WARN] consul: error getting server health from %q: %v",
				item.server.Name, ctx.Err())

			f.inflightLock.Lock()
			delete(f.inflight, item.server.ID)
			f.inflightLock.Unlock()
		}
	}
	return replies
}
Adds a stats fetcher to make sure we don't block the autopilot loop. 2017-03-18 01:42:28 +00:00			`package consul`

			`import (`
Converts the stats fetch from serial to parallel and snaps the last index. 2017-03-20 03:48:42 +00:00			`"context"`
			`"log"`
Adds a stats fetcher to make sure we don't block the autopilot loop. 2017-03-18 01:42:28 +00:00			`"sync"`

Move autopilot to a standalone package 2017-12-12 00:38:52 +00:00			`"github.com/hashicorp/consul/agent/consul/autopilot"`
agent: move agent/consul/agent to agent/metadata 2017-07-06 10:48:37 +00:00			`"github.com/hashicorp/consul/agent/metadata"`
agent: move conn pool for muxed connections into separate pkg 2017-06-15 13:16:16 +00:00			`"github.com/hashicorp/consul/agent/pool"`
More refactoring to make autopilot consul-agnostic 2017-12-13 01:45:03 +00:00			`"github.com/hashicorp/serf/serf"`
Adds a stats fetcher to make sure we don't block the autopilot loop. 2017-03-18 01:42:28 +00:00			`)`

Converts the stats fetch from serial to parallel and snaps the last index. 2017-03-20 03:48:42 +00:00			`// StatsFetcher has two functions for autopilot. First, lets us fetch all the`
			`// stats in parallel so we are taking a sample as close to the same time as`
			`// possible, since we are comparing time-sensitive info for the health check.`
			`// Second, it bounds the time so that one slow RPC can't hold up the health`
			`// check loop; as a side effect of how it implements this, it also limits to`
			`// a single in-flight RPC to any given server, so goroutines don't accumulate`
			`// as we run the health check fairly frequently.`
Adds a stats fetcher to make sure we don't block the autopilot loop. 2017-03-18 01:42:28 +00:00			`type StatsFetcher struct {`
Converts the stats fetch from serial to parallel and snaps the last index. 2017-03-20 03:48:42 +00:00			`logger *log.Logger`
agent: move conn pool for muxed connections into separate pkg 2017-06-15 13:16:16 +00:00			`pool *pool.ConnPool`
Adds a stats fetcher to make sure we don't block the autopilot loop. 2017-03-18 01:42:28 +00:00			`datacenter string`
			`inflight map[string]struct{}`
			`inflightLock sync.Mutex`
			`}`

			`// NewStatsFetcher returns a stats fetcher.`
agent: move conn pool for muxed connections into separate pkg 2017-06-15 13:16:16 +00:00			`func NewStatsFetcher(logger log.Logger, pool pool.ConnPool, datacenter string) *StatsFetcher {`
Adds a stats fetcher to make sure we don't block the autopilot loop. 2017-03-18 01:42:28 +00:00			`return &StatsFetcher{`
Converts the stats fetch from serial to parallel and snaps the last index. 2017-03-20 03:48:42 +00:00			`logger: logger,`
Adds a stats fetcher to make sure we don't block the autopilot loop. 2017-03-18 01:42:28 +00:00			`pool: pool,`
			`datacenter: datacenter,`
			`inflight: make(map[string]struct{}),`
			`}`
			`}`

Converts the stats fetch from serial to parallel and snaps the last index. 2017-03-20 03:48:42 +00:00			`// fetch does the RPC to fetch the server stats from a single server. We don't`
			`// cancel this when the context is canceled because we only want one in-flight`
			`// RPC to each server, so we let it finish and then clean up the in-flight`
			`// tracking.`
Move autopilot to a standalone package 2017-12-12 00:38:52 +00:00			`func (f StatsFetcher) fetch(server metadata.Server, replyCh chan *autopilot.ServerStats) {`
Converts the stats fetch from serial to parallel and snaps the last index. 2017-03-20 03:48:42 +00:00			`var args struct{}`
Move autopilot to a standalone package 2017-12-12 00:38:52 +00:00			`var reply autopilot.ServerStats`
Add a path for transitioning to TLS on an existing cluster (#3001) Fixes #1705 2017-05-10 21:25:48 +00:00			`err := f.pool.RPC(f.datacenter, server.Addr, server.Version, "Status.RaftStats", server.UseTLS, &args, &reply)`
Converts the stats fetch from serial to parallel and snaps the last index. 2017-03-20 03:48:42 +00:00			`if err != nil {`
Adds a warning if the context is canceled. 2017-03-20 16:27:28 +00:00			`f.logger.Printf("[WARN] consul: error getting server health from %q: %v",`
			`server.Name, err)`
Converts the stats fetch from serial to parallel and snaps the last index. 2017-03-20 03:48:42 +00:00			`} else {`
			`replyCh <- &reply`
			`}`
Adds a stats fetcher to make sure we don't block the autopilot loop. 2017-03-18 01:42:28 +00:00
Converts the stats fetch from serial to parallel and snaps the last index. 2017-03-20 03:48:42 +00:00			`f.inflightLock.Lock()`
			`delete(f.inflight, server.ID)`
			`f.inflightLock.Unlock()`
			`}`
Adds a stats fetcher to make sure we don't block the autopilot loop. 2017-03-18 01:42:28 +00:00
Converts the stats fetch from serial to parallel and snaps the last index. 2017-03-20 03:48:42 +00:00			`// Fetch will attempt to query all the servers in parallel.`
More refactoring to make autopilot consul-agnostic 2017-12-13 01:45:03 +00:00			`func (f StatsFetcher) Fetch(ctx context.Context, members []serf.Member) map[string]autopilot.ServerStats {`
Converts the stats fetch from serial to parallel and snaps the last index. 2017-03-20 03:48:42 +00:00			`type workItem struct {`
agent: move agent/consul/agent to agent/metadata 2017-07-06 10:48:37 +00:00			`server *metadata.Server`
Move autopilot to a standalone package 2017-12-12 00:38:52 +00:00			`replyCh chan *autopilot.ServerStats`
Converts the stats fetch from serial to parallel and snaps the last index. 2017-03-20 03:48:42 +00:00			`}`
More refactoring to make autopilot consul-agnostic 2017-12-13 01:45:03 +00:00			`var servers []*metadata.Server`
			`for _, s := range members {`
			`if ok, parts := metadata.IsConsulServer(s); ok {`
			`servers = append(servers, parts)`
			`}`
			`}`
Adds a stats fetcher to make sure we don't block the autopilot loop. 2017-03-18 01:42:28 +00:00
Converts the stats fetch from serial to parallel and snaps the last index. 2017-03-20 03:48:42 +00:00			`// Skip any servers that have inflight requests.`
More refactoring to make autopilot consul-agnostic 2017-12-13 01:45:03 +00:00			`var work []*workItem`
Converts the stats fetch from serial to parallel and snaps the last index. 2017-03-20 03:48:42 +00:00			`f.inflightLock.Lock()`
			`for _, server := range servers {`
			`if _, ok := f.inflight[server.ID]; ok {`
Adds a warning if the context is canceled. 2017-03-20 16:27:28 +00:00			`f.logger.Printf("[WARN] consul: error getting server health from %q: last request still outstanding",`
			`server.Name)`
Adds a stats fetcher to make sure we don't block the autopilot loop. 2017-03-18 01:42:28 +00:00			`} else {`
Converts the stats fetch from serial to parallel and snaps the last index. 2017-03-20 03:48:42 +00:00			`workItem := &workItem{`
			`server: server,`
Move autopilot to a standalone package 2017-12-12 00:38:52 +00:00			`replyCh: make(chan *autopilot.ServerStats, 1),`
Converts the stats fetch from serial to parallel and snaps the last index. 2017-03-20 03:48:42 +00:00			`}`
			`work = append(work, workItem)`
			`f.inflight[server.ID] = struct{}{}`
			`go f.fetch(workItem.server, workItem.replyCh)`
Adds a stats fetcher to make sure we don't block the autopilot loop. 2017-03-18 01:42:28 +00:00			`}`
Converts the stats fetch from serial to parallel and snaps the last index. 2017-03-20 03:48:42 +00:00			`}`
			`f.inflightLock.Unlock()`

			`// Now wait for the results to come in, or for the context to be`
			`// canceled.`
Move autopilot to a standalone package 2017-12-12 00:38:52 +00:00			`replies := make(map[string]*autopilot.ServerStats)`
Converts the stats fetch from serial to parallel and snaps the last index. 2017-03-20 03:48:42 +00:00			`for _, workItem := range work {`
Fix stats fetcher healthcheck RPCs not being independent 2018-08-14 21:23:52 +00:00			`// Drain the reply first if there is one.`
			`select {`
			`case reply := <-workItem.replyCh:`
			`replies[workItem.server.ID] = reply`
			`continue`
			`default:`
			`}`

Converts the stats fetch from serial to parallel and snaps the last index. 2017-03-20 03:48:42 +00:00			`select {`
			`case reply := <-workItem.replyCh:`
			`replies[workItem.server.ID] = reply`
Adds a stats fetcher to make sure we don't block the autopilot loop. 2017-03-18 01:42:28 +00:00
Converts the stats fetch from serial to parallel and snaps the last index. 2017-03-20 03:48:42 +00:00			`case <-ctx.Done():`
Adds a warning if the context is canceled. 2017-03-20 16:27:28 +00:00			`f.logger.Printf("[WARN] consul: error getting server health from %q: %v",`
			`workItem.server.Name, ctx.Err())`
Clean up StatsFetcher work when context is exceeded (#6086) 2019-07-12 14:23:28 +00:00
			`f.inflightLock.Lock()`
			`delete(f.inflight, workItem.server.ID)`
			`f.inflightLock.Unlock()`
Converts the stats fetch from serial to parallel and snaps the last index. 2017-03-20 03:48:42 +00:00			`}`
Adds a stats fetcher to make sure we don't block the autopilot loop. 2017-03-18 01:42:28 +00:00			`}`
Converts the stats fetch from serial to parallel and snaps the last index. 2017-03-20 03:48:42 +00:00			`return replies`
Adds a stats fetcher to make sure we don't block the autopilot loop. 2017-03-18 01:42:28 +00:00			`}`