open-consul/agent/local.go

803 lines
23 KiB
Go
Raw Normal View History

package agent
import (
"fmt"
2014-01-21 19:52:25 +00:00
"log"
"reflect"
"strings"
"sync"
2014-02-07 19:58:24 +00:00
"sync/atomic"
"time"
"github.com/hashicorp/consul/acl"
New config parser, HCL support, multiple bind addrs (#3480) * new config parser for agent This patch implements a new config parser for the consul agent which makes the following changes to the previous implementation: * add HCL support * all configuration fragments in tests and for default config are expressed as HCL fragments * HCL fragments can be provided on the command line so that they can eventually replace the command line flags. * HCL/JSON fragments are parsed into a temporary Config structure which can be merged using reflection (all values are pointers). The existing merge logic of overwrite for values and append for slices has been preserved. * A single builder process generates a typed runtime configuration for the agent. The new implementation is more strict and fails in the builder process if no valid runtime configuration can be generated. Therefore, additional validations in other parts of the code should be removed. The builder also pre-computes all required network addresses so that no address/port magic should be required where the configuration is used and should therefore be removed. * Upgrade github.com/hashicorp/hcl to support int64 * improve error messages * fix directory permission test * Fix rtt test * Fix ForceLeave test * Skip performance test for now until we know what to do * Update github.com/hashicorp/memberlist to update log prefix * Make memberlist use the default logger * improve config error handling * do not fail on non-existing data-dir * experiment with non-uniform timeouts to get a handle on stalled leader elections * Run tests for packages separately to eliminate the spurious port conflicts * refactor private address detection and unify approach for ipv4 and ipv6. Fixes #2825 * do not allow unix sockets for DNS * improve bind and advertise addr error handling * go through builder using test coverage * minimal update to the docs * more coverage tests fixed * more tests * fix makefile * cleanup * fix port conflicts with external port server 'porter' * stop test server on error * do not run api test that change global ENV concurrently with the other tests * Run remaining api tests concurrently * no need for retry with the port number service * monkey patch race condition in go-sockaddr until we understand why that fails * monkey patch hcl decoder race condidtion until we understand why that fails * monkey patch spurious errors in strings.EqualFold from here * add test for hcl decoder race condition. Run with go test -parallel 128 * Increase timeout again * cleanup * don't log port allocations by default * use base command arg parsing to format help output properly * handle -dc deprecation case in Build * switch autopilot.max_trailing_logs to int * remove duplicate test case * remove unused methods * remove comments about flag/config value inconsistencies * switch got and want around since the error message was misleading. * Removes a stray debug log. * Removes a stray newline in imports. * Fixes TestACL_Version8. * Runs go fmt. * Adds a default case for unknown address types. * Reoders and reformats some imports. * Adds some comments and fixes typos. * Reorders imports. * add unix socket support for dns later * drop all deprecated flags and arguments * fix wrong field name * remove stray node-id file * drop unnecessary patch section in test * drop duplicate test * add test for LeaveOnTerm and SkipLeaveOnInt in client mode * drop "bla" and add clarifying comment for the test * split up tests to support enterprise/non-enterprise tests * drop raft multiplier and derive values during build phase * sanitize runtime config reflectively and add test * detect invalid config fields * fix tests with invalid config fields * use different values for wan sanitiziation test * drop recursor in favor of recursors * allow dns_config.udp_answer_limit to be zero * make sure tests run on machines with multiple ips * Fix failing tests in a few more places by providing a bind address in the test * Gets rid of skipped TestAgent_CheckPerformanceSettings and adds case for builder. * Add porter to server_test.go to make tests there less flaky * go fmt
2017-09-25 18:40:42 +00:00
"github.com/hashicorp/consul/agent/config"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/agent/token"
"github.com/hashicorp/consul/api"
"github.com/hashicorp/consul/lib"
"github.com/hashicorp/consul/types"
)
const (
syncStaggerIntv = 3 * time.Second
syncRetryIntv = 15 * time.Second
)
// syncStatus is used to represent the difference between
// the local and remote state, and if action needs to be taken
type syncStatus struct {
inSync bool // Is this in sync with the server
}
// localStateConfig is the configuration for the localState. It is
2017-07-07 07:18:33 +00:00
// populated during NewLocalAgent from the agent configuration to avoid
// race conditions with the agent configuration.
type localStateConfig struct {
AEInterval time.Duration
AdvertiseAddr string
CheckUpdateInterval time.Duration
Datacenter string
NodeID types.NodeID
NodeName string
TaggedAddresses map[string]string
Tokens *token.Store
}
// localState is used to represent the node's services,
// and checks. We used it to perform anti-entropy with the
// catalog representation
type localState struct {
2014-02-07 19:58:24 +00:00
// paused is used to check if we are paused. Must be the first
// element due to a go bug.
paused int32
sync.RWMutex
2014-01-21 19:52:25 +00:00
logger *log.Logger
2014-01-21 19:52:25 +00:00
// Config is the agent config
config localStateConfig
2014-01-21 19:52:25 +00:00
// delegate is the consul interface to use for keeping in sync
delegate delegate
// nodeInfoInSync tracks whether the server has our correct top-level
2017-01-18 22:26:42 +00:00
// node information in sync
nodeInfoInSync bool
// Services tracks the local services
services map[string]*structs.NodeService
serviceStatus map[string]syncStatus
serviceTokens map[string]string
// Checks tracks the local checks
checks map[types.CheckID]*structs.HealthCheck
checkStatus map[types.CheckID]syncStatus
checkTokens map[types.CheckID]string
checkCriticalTime map[types.CheckID]time.Time
// Used to track checks that are being deferred
deferCheck map[types.CheckID]*time.Timer
// metadata tracks the local metadata fields
metadata map[string]string
// consulCh is used to inform of a change to the known
// consul nodes. This may be used to retry a sync run
consulCh chan struct{}
// triggerCh is used to inform of a change to local state
// that requires anti-entropy with the server
triggerCh chan struct{}
}
// NewLocalState creates a is used to initialize the local state
New config parser, HCL support, multiple bind addrs (#3480) * new config parser for agent This patch implements a new config parser for the consul agent which makes the following changes to the previous implementation: * add HCL support * all configuration fragments in tests and for default config are expressed as HCL fragments * HCL fragments can be provided on the command line so that they can eventually replace the command line flags. * HCL/JSON fragments are parsed into a temporary Config structure which can be merged using reflection (all values are pointers). The existing merge logic of overwrite for values and append for slices has been preserved. * A single builder process generates a typed runtime configuration for the agent. The new implementation is more strict and fails in the builder process if no valid runtime configuration can be generated. Therefore, additional validations in other parts of the code should be removed. The builder also pre-computes all required network addresses so that no address/port magic should be required where the configuration is used and should therefore be removed. * Upgrade github.com/hashicorp/hcl to support int64 * improve error messages * fix directory permission test * Fix rtt test * Fix ForceLeave test * Skip performance test for now until we know what to do * Update github.com/hashicorp/memberlist to update log prefix * Make memberlist use the default logger * improve config error handling * do not fail on non-existing data-dir * experiment with non-uniform timeouts to get a handle on stalled leader elections * Run tests for packages separately to eliminate the spurious port conflicts * refactor private address detection and unify approach for ipv4 and ipv6. Fixes #2825 * do not allow unix sockets for DNS * improve bind and advertise addr error handling * go through builder using test coverage * minimal update to the docs * more coverage tests fixed * more tests * fix makefile * cleanup * fix port conflicts with external port server 'porter' * stop test server on error * do not run api test that change global ENV concurrently with the other tests * Run remaining api tests concurrently * no need for retry with the port number service * monkey patch race condition in go-sockaddr until we understand why that fails * monkey patch hcl decoder race condidtion until we understand why that fails * monkey patch spurious errors in strings.EqualFold from here * add test for hcl decoder race condition. Run with go test -parallel 128 * Increase timeout again * cleanup * don't log port allocations by default * use base command arg parsing to format help output properly * handle -dc deprecation case in Build * switch autopilot.max_trailing_logs to int * remove duplicate test case * remove unused methods * remove comments about flag/config value inconsistencies * switch got and want around since the error message was misleading. * Removes a stray debug log. * Removes a stray newline in imports. * Fixes TestACL_Version8. * Runs go fmt. * Adds a default case for unknown address types. * Reoders and reformats some imports. * Adds some comments and fixes typos. * Reorders imports. * add unix socket support for dns later * drop all deprecated flags and arguments * fix wrong field name * remove stray node-id file * drop unnecessary patch section in test * drop duplicate test * add test for LeaveOnTerm and SkipLeaveOnInt in client mode * drop "bla" and add clarifying comment for the test * split up tests to support enterprise/non-enterprise tests * drop raft multiplier and derive values during build phase * sanitize runtime config reflectively and add test * detect invalid config fields * fix tests with invalid config fields * use different values for wan sanitiziation test * drop recursor in favor of recursors * allow dns_config.udp_answer_limit to be zero * make sure tests run on machines with multiple ips * Fix failing tests in a few more places by providing a bind address in the test * Gets rid of skipped TestAgent_CheckPerformanceSettings and adds case for builder. * Add porter to server_test.go to make tests there less flaky * go fmt
2017-09-25 18:40:42 +00:00
func NewLocalState(c *config.RuntimeConfig, lg *log.Logger, tokens *token.Store) *localState {
lc := localStateConfig{
AEInterval: c.AEInterval,
New config parser, HCL support, multiple bind addrs (#3480) * new config parser for agent This patch implements a new config parser for the consul agent which makes the following changes to the previous implementation: * add HCL support * all configuration fragments in tests and for default config are expressed as HCL fragments * HCL fragments can be provided on the command line so that they can eventually replace the command line flags. * HCL/JSON fragments are parsed into a temporary Config structure which can be merged using reflection (all values are pointers). The existing merge logic of overwrite for values and append for slices has been preserved. * A single builder process generates a typed runtime configuration for the agent. The new implementation is more strict and fails in the builder process if no valid runtime configuration can be generated. Therefore, additional validations in other parts of the code should be removed. The builder also pre-computes all required network addresses so that no address/port magic should be required where the configuration is used and should therefore be removed. * Upgrade github.com/hashicorp/hcl to support int64 * improve error messages * fix directory permission test * Fix rtt test * Fix ForceLeave test * Skip performance test for now until we know what to do * Update github.com/hashicorp/memberlist to update log prefix * Make memberlist use the default logger * improve config error handling * do not fail on non-existing data-dir * experiment with non-uniform timeouts to get a handle on stalled leader elections * Run tests for packages separately to eliminate the spurious port conflicts * refactor private address detection and unify approach for ipv4 and ipv6. Fixes #2825 * do not allow unix sockets for DNS * improve bind and advertise addr error handling * go through builder using test coverage * minimal update to the docs * more coverage tests fixed * more tests * fix makefile * cleanup * fix port conflicts with external port server 'porter' * stop test server on error * do not run api test that change global ENV concurrently with the other tests * Run remaining api tests concurrently * no need for retry with the port number service * monkey patch race condition in go-sockaddr until we understand why that fails * monkey patch hcl decoder race condidtion until we understand why that fails * monkey patch spurious errors in strings.EqualFold from here * add test for hcl decoder race condition. Run with go test -parallel 128 * Increase timeout again * cleanup * don't log port allocations by default * use base command arg parsing to format help output properly * handle -dc deprecation case in Build * switch autopilot.max_trailing_logs to int * remove duplicate test case * remove unused methods * remove comments about flag/config value inconsistencies * switch got and want around since the error message was misleading. * Removes a stray debug log. * Removes a stray newline in imports. * Fixes TestACL_Version8. * Runs go fmt. * Adds a default case for unknown address types. * Reoders and reformats some imports. * Adds some comments and fixes typos. * Reorders imports. * add unix socket support for dns later * drop all deprecated flags and arguments * fix wrong field name * remove stray node-id file * drop unnecessary patch section in test * drop duplicate test * add test for LeaveOnTerm and SkipLeaveOnInt in client mode * drop "bla" and add clarifying comment for the test * split up tests to support enterprise/non-enterprise tests * drop raft multiplier and derive values during build phase * sanitize runtime config reflectively and add test * detect invalid config fields * fix tests with invalid config fields * use different values for wan sanitiziation test * drop recursor in favor of recursors * allow dns_config.udp_answer_limit to be zero * make sure tests run on machines with multiple ips * Fix failing tests in a few more places by providing a bind address in the test * Gets rid of skipped TestAgent_CheckPerformanceSettings and adds case for builder. * Add porter to server_test.go to make tests there less flaky * go fmt
2017-09-25 18:40:42 +00:00
AdvertiseAddr: c.AdvertiseAddrLAN.String(),
CheckUpdateInterval: c.CheckUpdateInterval,
Datacenter: c.Datacenter,
NodeID: c.NodeID,
NodeName: c.NodeName,
TaggedAddresses: map[string]string{},
Tokens: tokens,
}
for k, v := range c.TaggedAddresses {
lc.TaggedAddresses[k] = v
}
return &localState{
config: lc,
logger: lg,
services: make(map[string]*structs.NodeService),
serviceStatus: make(map[string]syncStatus),
serviceTokens: make(map[string]string),
checks: make(map[types.CheckID]*structs.HealthCheck),
checkStatus: make(map[types.CheckID]syncStatus),
checkTokens: make(map[types.CheckID]string),
checkCriticalTime: make(map[types.CheckID]time.Time),
deferCheck: make(map[types.CheckID]*time.Timer),
metadata: make(map[string]string),
consulCh: make(chan struct{}, 1),
triggerCh: make(chan struct{}, 1),
}
2014-01-21 19:52:25 +00:00
}
// changeMade is used to trigger an anti-entropy run
func (l *localState) changeMade() {
select {
case l.triggerCh <- struct{}{}:
default:
}
}
// ConsulServerUp is used to inform that a new consul server is now
// up. This can be used to speed up the sync process if we are blocking
// waiting to discover a consul server
func (l *localState) ConsulServerUp() {
select {
case l.consulCh <- struct{}{}:
default:
}
}
// Pause is used to pause state synchronization, this can be
2014-02-07 19:58:24 +00:00
// used to make batch changes
func (l *localState) Pause() {
make Pause()/Resume()/isPaused() behave more like a semaphore see: https://github.com/hashicorp/consul/issues/1173 #1173 Reasoning: somewhere during consul development Pause()/Resume() and PauseSync()/ResumeSync() were added to protect larger changes to agent's localState. A few of the places that it tries to protect are: - (a *Agent) AddService(...) # part of the method - (c *Command) handleReload(...) # almost the whole method - (l *localState) antiEntropy(...)# isPaused() prevents syncChanges() The main problem is, that in the middle of handleReload(...)'s critical section it indirectly (loadServices()) calls AddService(...). AddService() in turn calls Pause() to protect itself against syncChanges(). At the end of AddService() a defered call to Resume() is made. With the current implementation, this releases isPaused() "lock" in the middle of handleReload() allowing antiEntropy to kick in while configuration reload is still in progress. Specifically almost all services and probably all check are unloaded when syncChanges() is allowed to run. This in turn can causes massive service/check de-/re-registration, and since checks are by default registered in the critical state, a majority of services on a node can be marked as failing. It's made worse with automation, often calling `consul reload` in close proximity on many nodes in the cluster. This change basically turns Pause()/Resume() into P()/V() of a garden-variety semaphore. Allowing Pause() to be called multiple times, and releasing isPaused() only after all matching/defered Resumes() are called as well. TODO/NOTE: as with many semaphore implementations, it might be reasonable to panic() if l.paused ever becomes negative.
2015-09-11 16:28:06 +00:00
atomic.AddInt32(&l.paused, 1)
2014-02-07 19:58:24 +00:00
}
// Resume is used to resume state synchronization
func (l *localState) Resume() {
paused := atomic.AddInt32(&l.paused, -1)
if paused < 0 {
panic("unbalanced localState.Resume() detected")
}
2014-02-07 19:58:24 +00:00
l.changeMade()
}
// isPaused is used to check if we are paused
func (l *localState) isPaused() bool {
make Pause()/Resume()/isPaused() behave more like a semaphore see: https://github.com/hashicorp/consul/issues/1173 #1173 Reasoning: somewhere during consul development Pause()/Resume() and PauseSync()/ResumeSync() were added to protect larger changes to agent's localState. A few of the places that it tries to protect are: - (a *Agent) AddService(...) # part of the method - (c *Command) handleReload(...) # almost the whole method - (l *localState) antiEntropy(...)# isPaused() prevents syncChanges() The main problem is, that in the middle of handleReload(...)'s critical section it indirectly (loadServices()) calls AddService(...). AddService() in turn calls Pause() to protect itself against syncChanges(). At the end of AddService() a defered call to Resume() is made. With the current implementation, this releases isPaused() "lock" in the middle of handleReload() allowing antiEntropy to kick in while configuration reload is still in progress. Specifically almost all services and probably all check are unloaded when syncChanges() is allowed to run. This in turn can causes massive service/check de-/re-registration, and since checks are by default registered in the critical state, a majority of services on a node can be marked as failing. It's made worse with automation, often calling `consul reload` in close proximity on many nodes in the cluster. This change basically turns Pause()/Resume() into P()/V() of a garden-variety semaphore. Allowing Pause() to be called multiple times, and releasing isPaused() only after all matching/defered Resumes() are called as well. TODO/NOTE: as with many semaphore implementations, it might be reasonable to panic() if l.paused ever becomes negative.
2015-09-11 16:28:06 +00:00
return atomic.LoadInt32(&l.paused) > 0
2014-02-07 19:58:24 +00:00
}
// ServiceToken returns the configured ACL token for the given
// service ID. If none is present, the agent's token is returned.
func (l *localState) ServiceToken(id string) string {
2015-04-28 18:53:53 +00:00
l.RLock()
defer l.RUnlock()
return l.serviceToken(id)
}
// serviceToken returns an ACL token associated with a service.
func (l *localState) serviceToken(id string) string {
token := l.serviceTokens[id]
if token == "" {
token = l.config.Tokens.UserToken()
}
return token
}
// AddService is used to add a service entry to the local state.
// This entry is persistent and the agent will make a best effort to
// ensure it is registered
func (l *localState) AddService(service *structs.NodeService, token string) {
2014-01-21 00:22:59 +00:00
// Assign the ID if none given
if service.ID == "" && service.Service != "" {
service.ID = service.Service
}
2014-01-21 19:52:25 +00:00
l.Lock()
defer l.Unlock()
2014-01-21 19:52:25 +00:00
l.services[service.ID] = service
l.serviceStatus[service.ID] = syncStatus{}
l.serviceTokens[service.ID] = token
2014-01-21 19:52:25 +00:00
l.changeMade()
}
// RemoveService is used to remove a service entry from the local state.
// The agent will make a best effort to ensure it is deregistered
func (l *localState) RemoveService(serviceID string) error {
2014-01-21 19:52:25 +00:00
l.Lock()
defer l.Unlock()
if _, ok := l.services[serviceID]; ok {
delete(l.services, serviceID)
// Leave the service token around, if any, until we successfully
// delete the service.
l.serviceStatus[serviceID] = syncStatus{inSync: false}
l.changeMade()
} else {
return fmt.Errorf("Service does not exist")
}
return nil
}
// Services returns the locally registered services that the
// agent is aware of and are being kept in sync with the server
2014-01-21 19:52:25 +00:00
func (l *localState) Services() map[string]*structs.NodeService {
services := make(map[string]*structs.NodeService)
l.RLock()
defer l.RUnlock()
2014-01-21 19:52:25 +00:00
for name, serv := range l.services {
services[name] = serv
}
return services
}
// CheckToken is used to return the configured health check token for a
// Check, or if none is configured, the default agent ACL token.
func (l *localState) CheckToken(checkID types.CheckID) string {
2015-04-28 18:53:53 +00:00
l.RLock()
defer l.RUnlock()
return l.checkToken(checkID)
2015-04-28 18:53:53 +00:00
}
// checkToken returns an ACL token associated with a check.
func (l *localState) checkToken(checkID types.CheckID) string {
token := l.checkTokens[checkID]
if token == "" {
token = l.config.Tokens.UserToken()
}
return token
}
// AddCheck is used to add a health check to the local state.
// This entry is persistent and the agent will make a best effort to
// ensure it is registered
func (l *localState) AddCheck(check *structs.HealthCheck, token string) error {
// Set the node name
2014-01-21 19:52:25 +00:00
check.Node = l.config.NodeName
2014-01-21 19:52:25 +00:00
l.Lock()
defer l.Unlock()
// if there is a serviceID associated with the check, make sure it exists before adding it
// NOTE - This logic may be moved to be handled within the Agent's Addcheck method after a refactor
if check.ServiceID != "" && l.services[check.ServiceID] == nil {
return fmt.Errorf("ServiceID %q does not exist", check.ServiceID)
}
l.checks[check.CheckID] = check
l.checkStatus[check.CheckID] = syncStatus{}
l.checkTokens[check.CheckID] = token
delete(l.checkCriticalTime, check.CheckID)
l.changeMade()
return nil
}
// RemoveCheck is used to remove a health check from the local state.
// The agent will make a best effort to ensure it is deregistered
func (l *localState) RemoveCheck(checkID types.CheckID) {
2014-01-21 19:52:25 +00:00
l.Lock()
defer l.Unlock()
2014-01-21 19:52:25 +00:00
delete(l.checks, checkID)
// Leave the check token around, if any, until we successfully delete
// the check.
delete(l.checkCriticalTime, checkID)
l.checkStatus[checkID] = syncStatus{inSync: false}
2014-01-21 19:52:25 +00:00
l.changeMade()
}
// UpdateCheck is used to update the status of a check
func (l *localState) UpdateCheck(checkID types.CheckID, status, output string) {
2014-01-21 19:52:25 +00:00
l.Lock()
defer l.Unlock()
2014-01-21 19:52:25 +00:00
check, ok := l.checks[checkID]
if !ok {
return
}
// Update the critical time tracking (this doesn't cause a server updates
// so we can always keep this up to date).
if status == api.HealthCritical {
_, wasCritical := l.checkCriticalTime[checkID]
if !wasCritical {
l.checkCriticalTime[checkID] = time.Now()
}
} else {
delete(l.checkCriticalTime, checkID)
}
// Do nothing if update is idempotent
if check.Status == status && check.Output == output {
return
}
// Defer a sync if the output has changed. This is an optimization around
// frequent updates of output. Instead, we update the output internally,
// and periodically do a write-back to the servers. If there is a status
// change we do the write immediately.
if l.config.CheckUpdateInterval > 0 && check.Status == status {
check.Output = output
if _, ok := l.deferCheck[checkID]; !ok {
intv := time.Duration(uint64(l.config.CheckUpdateInterval)/2) + lib.RandomStagger(l.config.CheckUpdateInterval)
2015-04-23 20:37:20 +00:00
deferSync := time.AfterFunc(intv, func() {
l.Lock()
if _, ok := l.checkStatus[checkID]; ok {
l.checkStatus[checkID] = syncStatus{inSync: false}
l.changeMade()
}
delete(l.deferCheck, checkID)
l.Unlock()
})
l.deferCheck[checkID] = deferSync
}
return
}
// Update status and mark out of sync
check.Status = status
check.Output = output
2014-01-21 19:52:25 +00:00
l.checkStatus[checkID] = syncStatus{inSync: false}
l.changeMade()
}
// Checks returns the locally registered checks that the
// agent is aware of and are being kept in sync with the server
func (l *localState) Checks() map[types.CheckID]*structs.HealthCheck {
checks := make(map[types.CheckID]*structs.HealthCheck)
l.RLock()
defer l.RUnlock()
for checkID, check := range l.checks {
checks[checkID] = check
}
return checks
}
// CriticalCheck is used to return the duration a check has been critical along
// with its associated health check.
type CriticalCheck struct {
CriticalFor time.Duration
Check *structs.HealthCheck
}
// CriticalChecks returns locally registered health checks that the agent is
// aware of and are being kept in sync with the server, and that are in a
// critical state. This also returns information about how long each check has
// been critical.
func (l *localState) CriticalChecks() map[types.CheckID]CriticalCheck {
checks := make(map[types.CheckID]CriticalCheck)
l.RLock()
defer l.RUnlock()
now := time.Now()
for checkID, criticalTime := range l.checkCriticalTime {
checks[checkID] = CriticalCheck{
CriticalFor: now.Sub(criticalTime),
Check: l.checks[checkID],
}
}
return checks
}
// Metadata returns the local node metadata fields that the
// agent is aware of and are being kept in sync with the server
func (l *localState) Metadata() map[string]string {
metadata := make(map[string]string)
l.RLock()
defer l.RUnlock()
for key, value := range l.metadata {
metadata[key] = value
}
return metadata
}
// antiEntropy is a long running method used to perform anti-entropy
// between local and remote state.
2014-01-21 19:52:25 +00:00
func (l *localState) antiEntropy(shutdownCh chan struct{}) {
SYNC:
// Sync our state with the servers
2014-01-21 19:52:25 +00:00
for {
2014-04-14 19:47:58 +00:00
err := l.setSyncState()
if err == nil {
break
}
l.logger.Printf("[ERR] agent: failed to sync remote state: %v", err)
select {
case <-l.consulCh:
// Stagger the retry on leader election, avoid a thundering heard
select {
case <-time.After(lib.RandomStagger(aeScale(syncStaggerIntv, len(l.delegate.LANMembers())))):
case <-shutdownCh:
return
}
case <-time.After(syncRetryIntv + lib.RandomStagger(aeScale(syncRetryIntv, len(l.delegate.LANMembers())))):
2014-04-14 19:47:58 +00:00
case <-shutdownCh:
return
}
}
// Force-trigger AE to pickup any changes
2014-01-21 19:52:25 +00:00
l.changeMade()
// Schedule the next full sync, with a random stagger
aeIntv := aeScale(l.config.AEInterval, len(l.delegate.LANMembers()))
aeIntv = aeIntv + lib.RandomStagger(aeIntv)
aeTimer := time.After(aeIntv)
// Wait for sync events
for {
select {
case <-aeTimer:
goto SYNC
2014-01-21 19:52:25 +00:00
case <-l.triggerCh:
2014-02-07 19:58:24 +00:00
// Skip the sync if we are paused
if l.isPaused() {
continue
}
2014-01-21 19:52:25 +00:00
if err := l.syncChanges(); err != nil {
l.logger.Printf("[ERR] agent: failed to sync changes: %v", err)
}
2014-01-21 19:52:25 +00:00
case <-shutdownCh:
return
}
}
}
// setSyncState does a read of the server state, and updates
// the local syncStatus as appropriate
2014-01-21 19:52:25 +00:00
func (l *localState) setSyncState() error {
req := structs.NodeSpecificRequest{
Datacenter: l.config.Datacenter,
Node: l.config.NodeName,
QueryOptions: structs.QueryOptions{Token: l.config.Tokens.AgentToken()},
}
var out1 structs.IndexedNodeServices
var out2 structs.IndexedHealthChecks
if e := l.delegate.RPC("Catalog.NodeServices", &req, &out1); e != nil {
return e
}
if err := l.delegate.RPC("Health.NodeChecks", &req, &out2); err != nil {
return err
}
checks := out2.HealthChecks
2014-01-21 19:52:25 +00:00
l.Lock()
defer l.Unlock()
// Check the node info
if out1.NodeServices == nil || out1.NodeServices.Node == nil ||
2017-01-18 22:26:42 +00:00
out1.NodeServices.Node.ID != l.config.NodeID ||
!reflect.DeepEqual(out1.NodeServices.Node.TaggedAddresses, l.config.TaggedAddresses) ||
!reflect.DeepEqual(out1.NodeServices.Node.Meta, l.metadata) {
l.nodeInfoInSync = false
}
// Check all our services
services := make(map[string]*structs.NodeService)
if out1.NodeServices != nil {
services = out1.NodeServices.Services
}
for id := range l.services {
// If the local service doesn't exist remotely, then sync it
if _, ok := services[id]; !ok {
l.serviceStatus[id] = syncStatus{inSync: false}
}
}
for id, service := range services {
// If we don't have the service locally, deregister it
existing, ok := l.services[id]
if !ok {
// The consul service is created automatically, and does
// not need to be deregistered.
if id == structs.ConsulServiceID {
continue
}
l.serviceStatus[id] = syncStatus{inSync: false}
continue
2014-03-05 23:03:23 +00:00
}
// If our definition is different, we need to update it. Make a
// copy so that we don't retain a pointer to any actual state
// store info for in-memory RPCs.
if existing.EnableTagOverride {
existing.Tags = make([]string, len(service.Tags))
copy(existing.Tags, service.Tags)
}
equal := existing.IsSame(service)
l.serviceStatus[id] = syncStatus{inSync: equal}
}
2015-10-13 03:30:11 +00:00
// Index the remote health checks to improve efficiency
checkIndex := make(map[types.CheckID]*structs.HealthCheck, len(checks))
2015-10-13 03:30:11 +00:00
for _, check := range checks {
checkIndex[check.CheckID] = check
}
// Sync any check which doesn't exist on the remote side
for id := range l.checks {
2015-10-13 03:30:11 +00:00
if _, ok := checkIndex[id]; !ok {
l.checkStatus[id] = syncStatus{inSync: false}
}
}
for _, check := range checks {
// If we don't have the check locally, deregister it
id := check.CheckID
2014-01-21 19:52:25 +00:00
existing, ok := l.checks[id]
if !ok {
// The Serf check is created automatically, and does not
// need to be deregistered.
if id == structs.SerfCheckID {
continue
}
l.checkStatus[id] = syncStatus{inSync: false}
continue
}
// If our definition is different, we need to update it
var equal bool
if l.config.CheckUpdateInterval == 0 {
equal = existing.IsSame(check)
} else {
// Copy the existing check before potentially modifying
// it before the compare operation.
eCopy := existing.Clone()
// Copy the server's check before modifying, otherwise
2016-04-11 15:58:17 +00:00
// in-memory RPCs will have side effects.
cCopy := check.Clone()
// If there's a defer timer active then we've got a
// potentially spammy check so we don't sync the output
// during this sweep since the timer will mark the check
// out of sync for us. Otherwise, it is safe to sync the
// output now. This is especially important for checks
// that don't change state after they are created, in
// which case we'd never see their output synced back ever.
if _, ok := l.deferCheck[id]; ok {
eCopy.Output = ""
cCopy.Output = ""
}
equal = eCopy.IsSame(cCopy)
}
// Update the status
2014-01-21 19:52:25 +00:00
l.checkStatus[id] = syncStatus{inSync: equal}
}
return nil
}
// syncChanges is used to scan the status our local services and checks
// and update any that are out of sync with the server
2014-01-21 19:52:25 +00:00
func (l *localState) syncChanges() error {
l.Lock()
defer l.Unlock()
// We will do node-level info syncing at the end, since it will get
// updated by a service or check sync anyway, given how the register
// API works.
// Sync the services
for id, status := range l.serviceStatus {
if _, ok := l.services[id]; !ok {
if err := l.deleteService(id); err != nil {
return err
}
} else if !status.inSync {
if err := l.syncService(id); err != nil {
return err
}
2014-04-23 19:21:47 +00:00
} else {
l.logger.Printf("[DEBUG] agent: Service '%s' in sync", id)
}
}
// Sync the checks
for id, status := range l.checkStatus {
if _, ok := l.checks[id]; !ok {
if err := l.deleteCheck(id); err != nil {
return err
}
} else if !status.inSync {
// Cancel a deferred sync
if timer := l.deferCheck[id]; timer != nil {
timer.Stop()
delete(l.deferCheck, id)
}
if err := l.syncCheck(id); err != nil {
return err
}
2014-04-23 19:21:47 +00:00
} else {
l.logger.Printf("[DEBUG] agent: Check '%s' in sync", id)
}
}
// Now sync the node level info if we need to, and didn't do any of
// the other sync operations.
if !l.nodeInfoInSync {
if err := l.syncNodeInfo(); err != nil {
return err
}
} else {
l.logger.Printf("[DEBUG] agent: Node info in sync")
}
return nil
}
// deleteService is used to delete a service from the server
2014-01-21 19:52:25 +00:00
func (l *localState) deleteService(id string) error {
if id == "" {
return fmt.Errorf("ServiceID missing")
}
req := structs.DeregisterRequest{
Datacenter: l.config.Datacenter,
Node: l.config.NodeName,
ServiceID: id,
2015-04-28 18:53:53 +00:00
WriteRequest: structs.WriteRequest{Token: l.serviceToken(id)},
}
var out struct{}
err := l.delegate.RPC("Catalog.Deregister", &req, &out)
if err == nil || strings.Contains(err.Error(), "Unknown service") {
2014-01-21 19:52:25 +00:00
delete(l.serviceStatus, id)
delete(l.serviceTokens, id)
2014-01-21 19:52:25 +00:00
l.logger.Printf("[INFO] agent: Deregistered service '%s'", id)
return nil
} else if acl.IsErrPermissionDenied(err) {
l.serviceStatus[id] = syncStatus{inSync: true}
l.logger.Printf("[WARN] agent: Service '%s' deregistration blocked by ACLs", id)
return nil
}
return err
}
// deleteCheck is used to delete a check from the server
func (l *localState) deleteCheck(id types.CheckID) error {
if id == "" {
return fmt.Errorf("CheckID missing")
}
req := structs.DeregisterRequest{
Datacenter: l.config.Datacenter,
Node: l.config.NodeName,
CheckID: id,
2015-04-28 18:53:53 +00:00
WriteRequest: structs.WriteRequest{Token: l.checkToken(id)},
}
var out struct{}
err := l.delegate.RPC("Catalog.Deregister", &req, &out)
if err == nil || strings.Contains(err.Error(), "Unknown check") {
2014-01-21 19:52:25 +00:00
delete(l.checkStatus, id)
delete(l.checkTokens, id)
2014-01-21 19:52:25 +00:00
l.logger.Printf("[INFO] agent: Deregistered check '%s'", id)
return nil
} else if acl.IsErrPermissionDenied(err) {
l.checkStatus[id] = syncStatus{inSync: true}
l.logger.Printf("[WARN] agent: Check '%s' deregistration blocked by ACLs", id)
return nil
}
return err
}
// syncService is used to sync a service to the server
2014-01-21 19:52:25 +00:00
func (l *localState) syncService(id string) error {
req := structs.RegisterRequest{
Datacenter: l.config.Datacenter,
2017-01-18 22:26:42 +00:00
ID: l.config.NodeID,
Node: l.config.NodeName,
Address: l.config.AdvertiseAddr,
TaggedAddresses: l.config.TaggedAddresses,
NodeMeta: l.metadata,
Service: l.services[id],
WriteRequest: structs.WriteRequest{Token: l.serviceToken(id)},
}
// If the service has associated checks that are out of sync,
// piggyback them on the service sync so they are part of the
// same transaction and are registered atomically. We only let
// checks ride on service registrations with the same token,
// otherwise we need to register them separately so they don't
// pick up privileges from the service token.
var checks structs.HealthChecks
for _, check := range l.checks {
if check.ServiceID == id && (l.serviceToken(id) == l.checkToken(check.CheckID)) {
if stat, ok := l.checkStatus[check.CheckID]; !ok || !stat.inSync {
checks = append(checks, check)
}
}
}
// Backwards-compatibility for Consul < 0.5
if len(checks) == 1 {
req.Check = checks[0]
} else {
req.Checks = checks
}
var out struct{}
err := l.delegate.RPC("Catalog.Register", &req, &out)
if err == nil {
2014-01-21 19:52:25 +00:00
l.serviceStatus[id] = syncStatus{inSync: true}
// Given how the register API works, this info is also updated
// every time we sync a service.
l.nodeInfoInSync = true
2014-01-21 19:52:25 +00:00
l.logger.Printf("[INFO] agent: Synced service '%s'", id)
for _, check := range checks {
l.checkStatus[check.CheckID] = syncStatus{inSync: true}
}
} else if acl.IsErrPermissionDenied(err) {
l.serviceStatus[id] = syncStatus{inSync: true}
l.logger.Printf("[WARN] agent: Service '%s' registration blocked by ACLs", id)
for _, check := range checks {
l.checkStatus[check.CheckID] = syncStatus{inSync: true}
}
return nil
}
return err
}
2015-12-08 09:45:01 +00:00
// syncCheck is used to sync a check to the server
func (l *localState) syncCheck(id types.CheckID) error {
// Pull in the associated service if any
check := l.checks[id]
var service *structs.NodeService
if check.ServiceID != "" {
if serv, ok := l.services[check.ServiceID]; ok {
service = serv
2015-01-14 07:23:52 +00:00
}
}
req := structs.RegisterRequest{
Datacenter: l.config.Datacenter,
2017-01-18 22:26:42 +00:00
ID: l.config.NodeID,
Node: l.config.NodeName,
Address: l.config.AdvertiseAddr,
TaggedAddresses: l.config.TaggedAddresses,
NodeMeta: l.metadata,
Service: service,
Check: l.checks[id],
WriteRequest: structs.WriteRequest{Token: l.checkToken(id)},
}
var out struct{}
err := l.delegate.RPC("Catalog.Register", &req, &out)
if err == nil {
l.checkStatus[id] = syncStatus{inSync: true}
// Given how the register API works, this info is also updated
// every time we sync a check.
l.nodeInfoInSync = true
l.logger.Printf("[INFO] agent: Synced check '%s'", id)
} else if acl.IsErrPermissionDenied(err) {
l.checkStatus[id] = syncStatus{inSync: true}
l.logger.Printf("[WARN] agent: Check '%s' registration blocked by ACLs", id)
return nil
}
return err
}
func (l *localState) syncNodeInfo() error {
req := structs.RegisterRequest{
Datacenter: l.config.Datacenter,
2017-01-18 22:26:42 +00:00
ID: l.config.NodeID,
Node: l.config.NodeName,
Address: l.config.AdvertiseAddr,
TaggedAddresses: l.config.TaggedAddresses,
NodeMeta: l.metadata,
WriteRequest: structs.WriteRequest{Token: l.config.Tokens.AgentToken()},
}
var out struct{}
err := l.delegate.RPC("Catalog.Register", &req, &out)
if err == nil {
l.nodeInfoInSync = true
l.logger.Printf("[INFO] agent: Synced node info")
} else if acl.IsErrPermissionDenied(err) {
l.nodeInfoInSync = true
l.logger.Printf("[WARN] agent: Node info update blocked by ACLs")
return nil
}
return err
}