2020-08-08 01:08:43 +00:00
package agent
import (
2021-06-14 22:37:05 +00:00
"context"
2020-08-08 01:08:43 +00:00
"fmt"
"io"
"net"
2020-08-17 18:12:04 +00:00
"net/http"
2020-10-14 20:47:16 +00:00
"sync"
2020-08-08 01:08:43 +00:00
"time"
2021-06-14 22:37:05 +00:00
"github.com/armon/go-metrics"
2020-11-13 02:12:12 +00:00
"github.com/armon/go-metrics/prometheus"
2020-10-05 21:31:35 +00:00
"github.com/hashicorp/go-hclog"
"google.golang.org/grpc/grpclog"
2020-08-08 01:08:43 +00:00
autoconf "github.com/hashicorp/consul/agent/auto-config"
"github.com/hashicorp/consul/agent/cache"
"github.com/hashicorp/consul/agent/config"
2020-09-14 22:31:07 +00:00
"github.com/hashicorp/consul/agent/consul"
2021-02-25 21:22:30 +00:00
"github.com/hashicorp/consul/agent/consul/fsm"
"github.com/hashicorp/consul/agent/consul/usagemetrics"
2022-03-22 12:40:24 +00:00
grpc "github.com/hashicorp/consul/agent/grpc/private"
"github.com/hashicorp/consul/agent/grpc/private/resolver"
2021-02-25 21:22:30 +00:00
"github.com/hashicorp/consul/agent/local"
2020-08-08 01:08:43 +00:00
"github.com/hashicorp/consul/agent/pool"
2020-08-27 15:23:52 +00:00
"github.com/hashicorp/consul/agent/router"
2022-04-06 21:33:05 +00:00
"github.com/hashicorp/consul/agent/rpc/middleware"
2021-02-25 21:22:30 +00:00
"github.com/hashicorp/consul/agent/submatview"
2020-08-08 01:08:43 +00:00
"github.com/hashicorp/consul/agent/token"
2021-05-14 18:59:13 +00:00
"github.com/hashicorp/consul/agent/xds"
2020-08-08 01:08:43 +00:00
"github.com/hashicorp/consul/ipaddr"
"github.com/hashicorp/consul/lib"
"github.com/hashicorp/consul/logging"
"github.com/hashicorp/consul/tlsutil"
)
// TODO: BaseDeps should be renamed in the future once more of Agent.Start
// has been moved out in front of Agent.New, and we can better see the setup
// dependencies.
type BaseDeps struct {
2020-09-14 22:31:07 +00:00
consul . Deps // TODO: un-embed
RuntimeConfig * config . RuntimeConfig
MetricsHandler MetricsHandler
AutoConfig * autoconf . AutoConfig // TODO: use an interface
Cache * cache . Cache
2021-02-25 21:22:30 +00:00
ViewStore * submatview . Store
2022-03-31 19:11:49 +00:00
WatchedFiles [ ] string
2020-08-08 01:08:43 +00:00
}
2020-08-17 18:12:04 +00:00
// MetricsHandler provides an http.Handler for displaying metrics.
type MetricsHandler interface {
DisplayMetrics ( resp http . ResponseWriter , req * http . Request ) ( interface { } , error )
2021-06-14 22:37:05 +00:00
Stream ( ctx context . Context , encoder metrics . Encoder )
2020-08-17 18:12:04 +00:00
}
2020-12-21 18:25:32 +00:00
// ConfigLoader loads the agent configuration and returns the load result
// (runtime config, warnings, and watched files) or an error.
//
// NewBaseDeps calls it with a nil source.
// NOTE(review): presumably a non-nil source contributes additional
// configuration to the load; confirm against the implementations.
type ConfigLoader func(source config.Source) (config.LoadResult, error)
2020-08-08 01:08:43 +00:00
func NewBaseDeps ( configLoader ConfigLoader , logOut io . Writer ) ( BaseDeps , error ) {
d := BaseDeps { }
2020-12-21 18:25:32 +00:00
result , err := configLoader ( nil )
2020-08-08 01:08:43 +00:00
if err != nil {
return d , err
}
2022-03-31 19:11:49 +00:00
d . WatchedFiles = result . WatchedFiles
2020-12-21 18:25:32 +00:00
cfg := result . RuntimeConfig
2020-08-19 17:17:05 +00:00
logConf := cfg . Logging
logConf . Name = logging . Agent
2020-08-19 16:09:35 +00:00
d . Logger , err = logging . Setup ( logConf , logOut )
2020-08-08 01:08:43 +00:00
if err != nil {
return d , err
}
2021-04-26 15:57:07 +00:00
grpcLogInitOnce . Do ( func ( ) {
grpclog . SetLoggerV2 ( logging . NewGRPCLogger ( cfg . Logging . LogLevel , d . Logger ) )
} )
2020-08-08 01:08:43 +00:00
2020-12-21 18:25:32 +00:00
for _ , w := range result . Warnings {
2020-08-08 01:08:43 +00:00
d . Logger . Warn ( w )
}
cfg . NodeID , err = newNodeIDFromConfig ( cfg , d . Logger )
if err != nil {
return d , fmt . Errorf ( "failed to setup node ID: %w" , err )
}
2021-10-13 16:25:30 +00:00
isServer := result . RuntimeConfig . ServerMode
gauges , counters , summaries := getPrometheusDefs ( cfg . Telemetry , isServer )
2020-11-16 20:44:47 +00:00
cfg . Telemetry . PrometheusOpts . GaugeDefinitions = gauges
cfg . Telemetry . PrometheusOpts . CounterDefinitions = counters
cfg . Telemetry . PrometheusOpts . SummaryDefinitions = summaries
d . MetricsHandler , err = lib . InitTelemetry ( cfg . Telemetry )
2020-08-08 01:08:43 +00:00
if err != nil {
return d , fmt . Errorf ( "failed to initialize telemetry: %w" , err )
}
2022-03-18 10:46:58 +00:00
d . TLSConfigurator , err = tlsutil . NewConfigurator ( cfg . TLS , d . Logger )
2020-08-08 01:08:43 +00:00
if err != nil {
return d , err
}
d . RuntimeConfig = cfg
d . Tokens = new ( token . Store )
2020-08-17 23:30:25 +00:00
2021-02-12 17:43:36 +00:00
cfg . Cache . Logger = d . Logger . Named ( "cache" )
2020-08-08 01:08:43 +00:00
// cache-types are not registered yet, but they won't be used until the components are started.
d . Cache = cache . New ( cfg . Cache )
2021-02-25 21:22:30 +00:00
d . ViewStore = submatview . NewStore ( d . Logger . Named ( "viewstore" ) )
2020-08-08 01:08:43 +00:00
d . ConnPool = newConnPool ( cfg , d . Logger , d . TLSConfigurator )
2021-08-24 21:28:44 +00:00
builder := resolver . NewServerResolverBuilder ( resolver . Config {
// Set the authority to something sufficiently unique so any usage in
// tests would be self-isolating in the global resolver map, while also
// not incurring a huge penalty for non-test code.
Authority : cfg . Datacenter + "." + string ( cfg . NodeID ) ,
} )
2021-06-01 22:31:52 +00:00
resolver . Register ( builder )
2021-08-24 21:28:44 +00:00
d . GRPCConnPool = grpc . NewClientConnPool ( grpc . ClientConnPoolConfig {
Servers : builder ,
SrcAddr : d . ConnPool . SrcAddr ,
TLSWrapper : grpc . TLSWrapper ( d . TLSConfigurator . OutgoingRPCWrapper ( ) ) ,
ALPNWrapper : grpc . ALPNWrapper ( d . TLSConfigurator . OutgoingALPNRPCWrapper ( ) ) ,
UseTLSForDC : d . TLSConfigurator . UseTLS ,
DialingFromServer : cfg . ServerMode ,
DialingFromDatacenter : cfg . Datacenter ,
} )
2021-07-22 18:58:08 +00:00
d . LeaderForwarder = builder
2020-09-08 21:31:47 +00:00
2020-09-14 20:16:44 +00:00
d . Router = router . NewRouter ( d . Logger , cfg . Datacenter , fmt . Sprintf ( "%s.%s" , cfg . NodeName , cfg . Datacenter ) , builder )
2020-08-27 15:23:52 +00:00
2021-05-17 20:01:32 +00:00
// this needs to happen prior to creating auto-config as some of the dependencies
// must also be passed to auto-config
d , err = initEnterpriseBaseDeps ( d , cfg )
if err != nil {
return d , err
}
2020-08-08 01:08:43 +00:00
acConf := autoconf . Config {
2021-05-17 20:01:32 +00:00
DirectRPC : d . ConnPool ,
Logger : d . Logger ,
Loader : configLoader ,
ServerProvider : d . Router ,
TLSConfigurator : d . TLSConfigurator ,
Cache : d . Cache ,
Tokens : d . Tokens ,
2021-05-20 14:07:23 +00:00
EnterpriseConfig : initEnterpriseAutoConfig ( d . EnterpriseDeps , cfg ) ,
2020-08-08 01:08:43 +00:00
}
2021-05-17 20:01:32 +00:00
2020-08-08 01:08:43 +00:00
d . AutoConfig , err = autoconf . New ( acConf )
if err != nil {
return d , err
}
2022-04-06 21:33:05 +00:00
d . NewRequestRecorderFunc = middleware . NewRequestRecorder
d . GetNetRPCInterceptorFunc = middleware . GetNetRPCInterceptor
2021-05-17 20:01:32 +00:00
return d , nil
2020-08-08 01:08:43 +00:00
}
2021-04-26 15:57:07 +00:00
// grpcLogInitOnce guards installation of the global gRPC logger so that it
// happens only once. The test suite calls NewBaseDeps in many tests, and
// re-initializing the logger each time causes data races.
var grpcLogInitOnce sync.Once
2020-08-08 01:08:43 +00:00
func newConnPool ( config * config . RuntimeConfig , logger hclog . Logger , tls * tlsutil . Configurator ) * pool . ConnPool {
var rpcSrcAddr * net . TCPAddr
if ! ipaddr . IsAny ( config . RPCBindAddr ) {
rpcSrcAddr = & net . TCPAddr { IP : config . RPCBindAddr . IP }
}
pool := & pool . ConnPool {
Server : config . ServerMode ,
SrcAddr : rpcSrcAddr ,
Logger : logger . StandardLogger ( & hclog . StandardLoggerOptions { InferLevels : true } ) ,
TLSConfigurator : tls ,
Datacenter : config . Datacenter ,
}
if config . ServerMode {
pool . MaxTime = 2 * time . Minute
pool . MaxStreams = 64
} else {
2020-09-14 22:31:07 +00:00
// MaxTime controls how long we keep an idle connection open to a server.
// 127s was chosen as the first prime above 120s
// (arbitrarily chose to use a prime) with the intent of reusing
// connections who are used by once-a-minute cron(8) jobs *and* who
// use a 60s jitter window (e.g. in vixie cron job execution can
// drift by up to 59s per job, or 119s for a once-a-minute cron job).
2020-08-08 01:08:43 +00:00
pool . MaxTime = 127 * time . Second
pool . MaxStreams = 32
}
return pool
}
2020-10-14 20:47:16 +00:00
2020-11-13 02:12:12 +00:00
// getPrometheusDefs reaches into every slice of prometheus defs we've defined in each part of the agent, and appends
// all of our slices into one nice slice of definitions per metric type for the Consul agent to pass to go-metrics.
2021-10-13 16:25:30 +00:00
func getPrometheusDefs ( cfg lib . TelemetryConfig , isServer bool ) ( [ ] prometheus . GaugeDefinition , [ ] prometheus . CounterDefinition , [ ] prometheus . SummaryDefinition ) {
2021-05-04 14:36:53 +00:00
// TODO: "raft..." metrics come from the raft lib and we should migrate these to a telemetry
// package within. In the mean time, we're going to define a few here because they're key to monitoring Consul.
raftGauges := [ ] prometheus . GaugeDefinition {
{
Name : [ ] string { "raft" , "fsm" , "lastRestoreDuration" } ,
Help : "This measures how long the last FSM restore (from disk or leader) took." ,
} ,
{
Name : [ ] string { "raft" , "leader" , "oldestLogAge" } ,
Help : "This measures how old the oldest log in the leader's log store is." ,
} ,
}
2020-11-16 22:01:12 +00:00
// Build slice of slices for all gauge definitions
2020-11-13 02:12:12 +00:00
var gauges = [ ] [ ] prometheus . GaugeDefinition {
2020-11-14 00:26:08 +00:00
cache . Gauges ,
2020-11-13 02:12:12 +00:00
consul . RPCGauges ,
consul . SessionGauges ,
grpc . StatsGauges ,
2021-05-14 18:59:13 +00:00
xds . StatsGauges ,
2020-11-13 02:12:12 +00:00
usagemetrics . Gauges ,
2021-04-23 21:05:33 +00:00
consul . ReplicationGauges ,
2021-10-27 19:23:29 +00:00
CertExpirationGauges ,
2020-12-09 14:16:53 +00:00
Gauges ,
2021-05-04 14:36:53 +00:00
raftGauges ,
2020-11-13 02:12:12 +00:00
}
2021-05-04 14:36:53 +00:00
2021-10-13 16:25:30 +00:00
// TODO(ffmmm): conditionally add only leader specific metrics to gauges, counters, summaries, etc
if isServer {
2021-10-19 20:49:23 +00:00
gauges = append ( gauges ,
consul . AutopilotGauges ,
consul . LeaderCertExpirationGauges )
2021-10-13 16:25:30 +00:00
}
2020-11-16 22:01:12 +00:00
// Flatten definitions
// NOTE(kit): Do we actually want to create a set here so we can ensure definition names are unique?
2020-11-13 02:12:12 +00:00
var gaugeDefs [ ] prometheus . GaugeDefinition
for _ , g := range gauges {
2020-11-13 21:18:04 +00:00
// Set Consul to each definition's namespace
2020-11-16 22:01:12 +00:00
// TODO(kit): Prepending the service to each definition should be handled by go-metrics
2020-11-13 21:18:04 +00:00
var withService [ ] prometheus . GaugeDefinition
for _ , gauge := range g {
2020-11-16 22:01:12 +00:00
gauge . Name = append ( [ ] string { cfg . MetricsPrefix } , gauge . Name ... )
2020-11-13 21:18:04 +00:00
withService = append ( withService , gauge )
}
gaugeDefs = append ( gaugeDefs , withService ... )
2020-11-13 02:12:12 +00:00
}
raftCounters := [ ] prometheus . CounterDefinition {
2020-11-14 00:26:08 +00:00
// TODO(kit): "raft..." metrics come from the raft lib and we should migrate these to a telemetry
// package within. In the mean time, we're going to define a few here because they're key to monitoring Consul.
2020-11-13 02:12:12 +00:00
{
2020-11-13 21:18:04 +00:00
Name : [ ] string { "raft" , "apply" } ,
2020-11-13 02:12:12 +00:00
Help : "This counts the number of Raft transactions occurring over the interval." ,
} ,
{
2020-11-13 21:18:04 +00:00
Name : [ ] string { "raft" , "state" , "candidate" } ,
2020-11-13 02:12:12 +00:00
Help : "This increments whenever a Consul server starts an election." ,
} ,
{
2020-11-13 21:18:04 +00:00
Name : [ ] string { "raft" , "state" , "leader" } ,
2020-11-13 02:12:12 +00:00
Help : "This increments whenever a Consul server becomes a leader." ,
} ,
}
var counters = [ ] [ ] prometheus . CounterDefinition {
CatalogCounters ,
2020-11-14 00:26:08 +00:00
cache . Counters ,
2020-11-13 02:12:12 +00:00
consul . ACLCounters ,
consul . CatalogCounters ,
consul . ClientCounters ,
consul . RPCCounters ,
grpc . StatsCounters ,
local . StateCounters ,
raftCounters ,
}
2020-11-16 22:01:12 +00:00
// Flatten definitions
// NOTE(kit): Do we actually want to create a set here so we can ensure definition names are unique?
2020-11-13 02:12:12 +00:00
var counterDefs [ ] prometheus . CounterDefinition
for _ , c := range counters {
2020-11-16 22:01:12 +00:00
// TODO(kit): Prepending the service to each definition should be handled by go-metrics
2020-11-13 21:18:04 +00:00
var withService [ ] prometheus . CounterDefinition
for _ , counter := range c {
2020-11-16 22:01:12 +00:00
counter . Name = append ( [ ] string { cfg . MetricsPrefix } , counter . Name ... )
2020-11-13 21:18:04 +00:00
withService = append ( withService , counter )
}
counterDefs = append ( counterDefs , withService ... )
2020-11-13 02:12:12 +00:00
}
raftSummaries := [ ] prometheus . SummaryDefinition {
2020-11-14 00:26:08 +00:00
// TODO(kit): "raft..." metrics come from the raft lib and we should migrate these to a telemetry
// package within. In the mean time, we're going to define a few here because they're key to monitoring Consul.
2020-11-13 02:12:12 +00:00
{
2020-11-13 21:18:04 +00:00
Name : [ ] string { "raft" , "commitTime" } ,
2020-11-13 02:12:12 +00:00
Help : "This measures the time it takes to commit a new entry to the Raft log on the leader." ,
} ,
{
2020-11-13 21:18:04 +00:00
Name : [ ] string { "raft" , "leader" , "lastContact" } ,
2020-11-13 02:12:12 +00:00
Help : "Measures the time since the leader was last able to contact the follower nodes when checking its leader lease." ,
} ,
2021-05-04 14:36:53 +00:00
{
Name : [ ] string { "raft" , "snapshot" , "persist" } ,
Help : "Measures the time it takes raft to write a new snapshot to disk." ,
} ,
{
Name : [ ] string { "raft" , "rpc" , "installSnapshot" } ,
Help : "Measures the time it takes the raft leader to install a snapshot on a follower that is catching up after being down or has just joined the cluster." ,
} ,
2020-11-13 02:12:12 +00:00
}
var summaries = [ ] [ ] prometheus . SummaryDefinition {
HTTPSummaries ,
consul . ACLSummaries ,
consul . ACLEndpointSummaries ,
consul . CatalogSummaries ,
consul . FederationStateSummaries ,
consul . IntentionSummaries ,
consul . KVSummaries ,
2020-11-14 00:26:08 +00:00
consul . LeaderSummaries ,
2020-11-13 02:12:12 +00:00
consul . PreparedQuerySummaries ,
consul . RPCSummaries ,
2020-11-14 00:26:08 +00:00
consul . SegmentOSSSummaries ,
2020-11-13 02:12:12 +00:00
consul . SessionSummaries ,
2020-11-14 00:26:08 +00:00
consul . SessionEndpointSummaries ,
2020-11-13 02:12:12 +00:00
consul . TxnSummaries ,
2020-11-14 00:26:08 +00:00
fsm . CommandsSummaries ,
fsm . SnapshotSummaries ,
2020-11-13 02:12:12 +00:00
raftSummaries ,
}
2020-11-16 22:01:12 +00:00
// Flatten definitions
// NOTE(kit): Do we actually want to create a set here so we can ensure definition names are unique?
2020-11-13 02:12:12 +00:00
var summaryDefs [ ] prometheus . SummaryDefinition
for _ , s := range summaries {
2020-11-16 22:01:12 +00:00
// TODO(kit): Prepending the service to each definition should be handled by go-metrics
2020-11-13 21:18:04 +00:00
var withService [ ] prometheus . SummaryDefinition
for _ , summary := range s {
2020-11-16 22:01:12 +00:00
summary . Name = append ( [ ] string { cfg . MetricsPrefix } , summary . Name ... )
2020-11-13 21:18:04 +00:00
withService = append ( withService , summary )
}
summaryDefs = append ( summaryDefs , withService ... )
2020-11-13 02:12:12 +00:00
}
2020-11-16 20:44:47 +00:00
return gaugeDefs , counterDefs , summaryDefs
2020-11-13 02:12:12 +00:00
}