2017-03-18 01:42:28 +00:00
|
|
|
package consul
|
|
|
|
|
|
|
|
import (
|
2017-03-20 03:48:42 +00:00
|
|
|
"context"
|
|
|
|
"log"
|
2017-03-18 01:42:28 +00:00
|
|
|
"sync"
|
|
|
|
|
2017-12-12 00:38:52 +00:00
|
|
|
"github.com/hashicorp/consul/agent/consul/autopilot"
|
2017-07-06 10:48:37 +00:00
|
|
|
"github.com/hashicorp/consul/agent/metadata"
|
2017-06-15 13:16:16 +00:00
|
|
|
"github.com/hashicorp/consul/agent/pool"
|
2017-12-13 01:45:03 +00:00
|
|
|
"github.com/hashicorp/serf/serf"
|
2017-03-18 01:42:28 +00:00
|
|
|
)
|
|
|
|
|
2017-03-20 03:48:42 +00:00
|
|
|
// StatsFetcher has two functions for autopilot. First, lets us fetch all the
// stats in parallel so we are taking a sample as close to the same time as
// possible, since we are comparing time-sensitive info for the health check.
// Second, it bounds the time so that one slow RPC can't hold up the health
// check loop; as a side effect of how it implements this, it also limits to
// a single in-flight RPC to any given server, so goroutines don't accumulate
// as we run the health check fairly frequently.
type StatsFetcher struct {
	// logger is used to report fetch failures.
	logger *log.Logger

	// pool is the connection pool used to make Status.RaftStats RPCs to
	// each server.
	pool *pool.ConnPool

	// datacenter is the datacenter the RPCs are routed to.
	datacenter string

	// inflight tracks the server IDs (keyed by server.ID) that currently
	// have an outstanding stats RPC; guarded by inflightLock.
	inflight map[string]struct{}

	// inflightLock protects inflight.
	inflightLock sync.Mutex
}
|
|
|
|
|
|
|
|
// NewStatsFetcher returns a stats fetcher.
|
2017-06-15 13:16:16 +00:00
|
|
|
func NewStatsFetcher(logger *log.Logger, pool *pool.ConnPool, datacenter string) *StatsFetcher {
|
2017-03-18 01:42:28 +00:00
|
|
|
return &StatsFetcher{
|
2017-03-20 03:48:42 +00:00
|
|
|
logger: logger,
|
2017-03-18 01:42:28 +00:00
|
|
|
pool: pool,
|
|
|
|
datacenter: datacenter,
|
|
|
|
inflight: make(map[string]struct{}),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-03-20 03:48:42 +00:00
|
|
|
// fetch does the RPC to fetch the server stats from a single server. We don't
|
|
|
|
// cancel this when the context is canceled because we only want one in-flight
|
|
|
|
// RPC to each server, so we let it finish and then clean up the in-flight
|
|
|
|
// tracking.
|
2017-12-12 00:38:52 +00:00
|
|
|
func (f *StatsFetcher) fetch(server *metadata.Server, replyCh chan *autopilot.ServerStats) {
|
2017-03-20 03:48:42 +00:00
|
|
|
var args struct{}
|
2017-12-12 00:38:52 +00:00
|
|
|
var reply autopilot.ServerStats
|
2017-05-10 21:25:48 +00:00
|
|
|
err := f.pool.RPC(f.datacenter, server.Addr, server.Version, "Status.RaftStats", server.UseTLS, &args, &reply)
|
2017-03-20 03:48:42 +00:00
|
|
|
if err != nil {
|
2017-03-20 16:27:28 +00:00
|
|
|
f.logger.Printf("[WARN] consul: error getting server health from %q: %v",
|
|
|
|
server.Name, err)
|
2017-03-20 03:48:42 +00:00
|
|
|
} else {
|
|
|
|
replyCh <- &reply
|
|
|
|
}
|
2017-03-18 01:42:28 +00:00
|
|
|
|
2017-03-20 03:48:42 +00:00
|
|
|
f.inflightLock.Lock()
|
|
|
|
delete(f.inflight, server.ID)
|
|
|
|
f.inflightLock.Unlock()
|
|
|
|
}
|
2017-03-18 01:42:28 +00:00
|
|
|
|
2017-03-20 03:48:42 +00:00
|
|
|
// Fetch will attempt to query all the servers in parallel.
|
2017-12-13 01:45:03 +00:00
|
|
|
func (f *StatsFetcher) Fetch(ctx context.Context, members []serf.Member) map[string]*autopilot.ServerStats {
|
2017-03-20 03:48:42 +00:00
|
|
|
type workItem struct {
|
2017-07-06 10:48:37 +00:00
|
|
|
server *metadata.Server
|
2017-12-12 00:38:52 +00:00
|
|
|
replyCh chan *autopilot.ServerStats
|
2017-03-20 03:48:42 +00:00
|
|
|
}
|
2017-12-13 01:45:03 +00:00
|
|
|
var servers []*metadata.Server
|
|
|
|
for _, s := range members {
|
|
|
|
if ok, parts := metadata.IsConsulServer(s); ok {
|
|
|
|
servers = append(servers, parts)
|
|
|
|
}
|
|
|
|
}
|
2017-03-18 01:42:28 +00:00
|
|
|
|
2017-03-20 03:48:42 +00:00
|
|
|
// Skip any servers that have inflight requests.
|
2017-12-13 01:45:03 +00:00
|
|
|
var work []*workItem
|
2017-03-20 03:48:42 +00:00
|
|
|
f.inflightLock.Lock()
|
|
|
|
for _, server := range servers {
|
|
|
|
if _, ok := f.inflight[server.ID]; ok {
|
2017-03-20 16:27:28 +00:00
|
|
|
f.logger.Printf("[WARN] consul: error getting server health from %q: last request still outstanding",
|
|
|
|
server.Name)
|
2017-03-18 01:42:28 +00:00
|
|
|
} else {
|
2017-03-20 03:48:42 +00:00
|
|
|
workItem := &workItem{
|
|
|
|
server: server,
|
2017-12-12 00:38:52 +00:00
|
|
|
replyCh: make(chan *autopilot.ServerStats, 1),
|
2017-03-20 03:48:42 +00:00
|
|
|
}
|
|
|
|
work = append(work, workItem)
|
|
|
|
f.inflight[server.ID] = struct{}{}
|
|
|
|
go f.fetch(workItem.server, workItem.replyCh)
|
2017-03-18 01:42:28 +00:00
|
|
|
}
|
2017-03-20 03:48:42 +00:00
|
|
|
}
|
|
|
|
f.inflightLock.Unlock()
|
|
|
|
|
|
|
|
// Now wait for the results to come in, or for the context to be
|
|
|
|
// canceled.
|
2017-12-12 00:38:52 +00:00
|
|
|
replies := make(map[string]*autopilot.ServerStats)
|
2017-03-20 03:48:42 +00:00
|
|
|
for _, workItem := range work {
|
2018-08-14 21:23:52 +00:00
|
|
|
// Drain the reply first if there is one.
|
|
|
|
select {
|
|
|
|
case reply := <-workItem.replyCh:
|
|
|
|
replies[workItem.server.ID] = reply
|
|
|
|
continue
|
|
|
|
default:
|
|
|
|
}
|
|
|
|
|
2017-03-20 03:48:42 +00:00
|
|
|
select {
|
|
|
|
case reply := <-workItem.replyCh:
|
|
|
|
replies[workItem.server.ID] = reply
|
2017-03-18 01:42:28 +00:00
|
|
|
|
2017-03-20 03:48:42 +00:00
|
|
|
case <-ctx.Done():
|
2017-03-20 16:27:28 +00:00
|
|
|
f.logger.Printf("[WARN] consul: error getting server health from %q: %v",
|
|
|
|
workItem.server.Name, ctx.Err())
|
2019-07-12 14:23:28 +00:00
|
|
|
|
|
|
|
f.inflightLock.Lock()
|
|
|
|
delete(f.inflight, workItem.server.ID)
|
|
|
|
f.inflightLock.Unlock()
|
2017-03-20 03:48:42 +00:00
|
|
|
}
|
2017-03-18 01:42:28 +00:00
|
|
|
}
|
2017-03-20 03:48:42 +00:00
|
|
|
return replies
|
2017-03-18 01:42:28 +00:00
|
|
|
}
|