open-nomad/client/consul/sync.go

550 lines
16 KiB
Go
Raw Normal View History

2016-03-23 07:50:41 +00:00
package consul
import (
"crypto/tls"
"fmt"
"log"
"net/http"
2016-03-23 17:43:35 +00:00
"net/url"
"reflect"
2016-03-23 07:50:41 +00:00
"strings"
2016-03-24 01:03:02 +00:00
"sync"
2016-03-23 07:50:41 +00:00
"time"
consul "github.com/hashicorp/consul/api"
"github.com/hashicorp/consul/lib"
2016-03-23 17:43:35 +00:00
"github.com/hashicorp/go-multierror"
2016-03-23 07:50:41 +00:00
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/nomad/structs/config"
"github.com/hashicorp/nomad/nomad/types"
2016-03-23 07:50:41 +00:00
)
type (
	// notifyEvent is the empty payload sent over a notifyChannel. It carries
	// no data; only the act of sending it is meaningful.
	notifyEvent struct{}

	// notifyChannel is a channel of notifyEvent values used purely for
	// signalling.
	notifyChannel chan notifyEvent
)
// Syncer allows syncing of services and checks with Consul
type Syncer struct {
2016-05-24 04:35:31 +00:00
client *consul.Client
runChecks bool
2016-03-23 07:50:41 +00:00
2016-05-14 07:36:26 +00:00
serviceIdentifier string // serviceIdentifier is a token which identifies which task/alloc the service belongs to
delegateChecks map[string]struct{} // delegateChecks are the checks that the Nomad client runs and reports to Consul
createCheck func(*structs.ServiceCheck, string) (Check, error)
2016-05-14 07:36:26 +00:00
addrFinder func(portLabel string) (string, int)
2016-03-23 07:50:41 +00:00
2016-03-24 01:21:27 +00:00
trackedServices map[string]*consul.AgentService
trackedChecks map[string]*consul.AgentCheckRegistration
2016-03-25 04:17:33 +00:00
checkRunners map[string]*CheckRunner
2016-03-23 07:50:41 +00:00
2016-03-24 01:03:02 +00:00
logger *log.Logger
shutdownCh types.ShutdownChannel
2016-03-24 01:03:02 +00:00
shutdown bool
shutdownLock sync.Mutex
2016-03-23 07:50:41 +00:00
// periodicCallbacks is walked sequentially when the timer in Run
// fires.
periodicCallbacks map[string]types.PeriodicCallback
notifySyncCh notifyChannel
periodicLock sync.RWMutex
2016-03-23 07:50:41 +00:00
}
// Timing parameters for syncing with Consul.
const (
	// initialSyncBuffer is the max time an initial sync will sleep
	// before syncing.
	initialSyncBuffer = 30 * time.Second

	// initialSyncDelay is the delay before an initial sync.
	initialSyncDelay = 5 * time.Second

	// syncInterval is the periodic time interval for syncing services and
	// checks with Consul.
	syncInterval = 5 * time.Second

	// syncJitter provides a little variance in the frequency at which
	// Syncer polls Consul. The interval is shortened by a random amount of
	// up to syncInterval/syncJitter.
	syncJitter = 8

	// ttlCheckBuffer is the extra time, on top of the check interval, that
	// Nomad is allowed to take to report a check result to Consul.
	ttlCheckBuffer = 31 * time.Second
)
2016-05-23 14:29:08 +00:00
// NewSyncer returns a new consul.Syncer
func NewSyncer(config *config.ConsulConfig, logger *log.Logger) (*Syncer, error) {
2016-03-23 07:50:41 +00:00
var err error
var c *consul.Client
cfg := consul.DefaultConfig()
if config.Addr != "" {
cfg.Address = config.Addr
}
if config.Token != "" {
cfg.Token = config.Token
}
if config.Auth != "" {
var username, password string
if strings.Contains(config.Auth, ":") {
split := strings.SplitN(config.Auth, ":", 2)
username = split[0]
password = split[1]
} else {
username = config.Auth
}
cfg.HttpAuth = &consul.HttpBasicAuth{
Username: username,
Password: password,
}
}
if config.EnableSSL {
cfg.Scheme = "https"
tlsCfg := consul.TLSConfig{
Address: cfg.Address,
CAFile: config.CAFile,
CertFile: config.CertFile,
KeyFile: config.KeyFile,
InsecureSkipVerify: !config.VerifySSL,
}
tlsClientCfg, err := consul.SetupTLSConfig(&tlsCfg)
if err != nil {
return nil, fmt.Errorf("error creating tls client config for consul: %v", err)
}
cfg.HttpClient.Transport = &http.Transport{
TLSClientConfig: tlsClientCfg,
}
2016-03-23 07:50:41 +00:00
}
if config.EnableSSL && !config.VerifySSL {
cfg.HttpClient.Transport = &http.Transport{
TLSClientConfig: &tls.Config{
InsecureSkipVerify: true,
},
}
}
if c, err = consul.NewClient(cfg); err != nil {
return nil, err
}
consulSyncer := Syncer{
client: c,
logger: logger,
trackedServices: make(map[string]*consul.AgentService),
trackedChecks: make(map[string]*consul.AgentCheckRegistration),
checkRunners: make(map[string]*CheckRunner),
shutdownCh: make(types.ShutdownChannel),
periodicCallbacks: make(map[string]types.PeriodicCallback),
2016-03-23 07:50:41 +00:00
}
return &consulSyncer, nil
2016-03-23 07:50:41 +00:00
}
2016-03-25 02:30:02 +00:00
// SetDelegatedChecks sets the checks that nomad is going to run and report the
// result back to consul, together with the factory used to build a runnable
// Check for them. It returns the Syncer so calls can be chained.
func (c *Syncer) SetDelegatedChecks(delegateChecks map[string]struct{}, createCheck func(*structs.ServiceCheck, string) (Check, error)) *Syncer {
	c.delegateChecks, c.createCheck = delegateChecks, createCheck
	return c
}
2016-05-14 07:36:26 +00:00
// SetAddrFinder installs the function used to resolve a Service's port label
// into the host and port it is reachable on. It returns the Syncer so calls
// can be chained.
func (c *Syncer) SetAddrFinder(addrFinder func(string) (string, int)) *Syncer {
	c.addrFinder = addrFinder

	return c
}
// SetServiceIdentifier sets the identifier of the services we are syncing
// with Consul. The identifier is used when deriving service IDs and in log
// messages. It returns the Syncer so calls can be chained.
func (c *Syncer) SetServiceIdentifier(serviceIdentifier string) *Syncer {
	c.serviceIdentifier = serviceIdentifier

	return c
}
// SyncNow signals the Run loop that a sync has been requested.
//
// The send never blocks: when the notification cannot be delivered right
// away (nothing is ready to receive it, or the channel is nil) it is
// silently dropped.
func (c *Syncer) SyncNow() {
	select {
	case c.notifySyncCh <- notifyEvent{}:
		// Notification handed over.
	default:
		// Nobody can take the notification right now; drop it.
	}
}
// SyncServices sync the services with the Consul Agent
func (c *Syncer) SyncServices(services []*structs.Service) error {
2016-03-23 17:43:35 +00:00
var mErr multierror.Error
2016-03-24 01:21:27 +00:00
taskServices := make(map[string]*consul.AgentService)
taskChecks := make(map[string]*consul.AgentCheckRegistration)
2016-03-23 17:43:35 +00:00
// Register Services and Checks that we don't know about or has changed
for _, service := range services {
2016-03-23 17:43:35 +00:00
srv, err := c.createService(service)
if err != nil {
mErr.Errors = append(mErr.Errors, err)
continue
}
2016-03-24 01:21:27 +00:00
trackedService, ok := c.trackedServices[srv.ID]
2016-03-23 17:43:35 +00:00
if (ok && !reflect.DeepEqual(trackedService, srv)) || !ok {
if err := c.registerService(srv); err != nil {
mErr.Errors = append(mErr.Errors, err)
}
2016-03-23 17:43:35 +00:00
}
2016-03-24 01:21:27 +00:00
c.trackedServices[srv.ID] = srv
taskServices[srv.ID] = srv
2016-03-23 17:43:35 +00:00
for _, chk := range service.Checks {
// Create a consul check registration
2016-03-24 22:09:50 +00:00
chkReg, err := c.createCheckReg(chk, srv)
if err != nil {
mErr.Errors = append(mErr.Errors, err)
continue
}
// creating a nomad check if we have to handle this particular check type
if _, ok := c.delegateChecks[chk.Type]; ok {
nc, err := c.createCheck(chk, chkReg.ID)
if err != nil {
mErr.Errors = append(mErr.Errors, err)
continue
}
cr := NewCheckRunner(nc, c.runCheck, c.logger)
c.checkRunners[nc.ID()] = cr
}
if _, ok := c.trackedChecks[chkReg.ID]; !ok {
if err := c.registerCheck(chkReg); err != nil {
mErr.Errors = append(mErr.Errors, err)
}
2016-03-23 17:43:35 +00:00
}
c.trackedChecks[chkReg.ID] = chkReg
taskChecks[chkReg.ID] = chkReg
2016-03-23 17:43:35 +00:00
}
}
// Remove services that are not present anymore
2016-03-24 01:21:27 +00:00
for _, service := range c.trackedServices {
if _, ok := taskServices[service.ID]; !ok {
if err := c.deregisterService(service.ID); err != nil {
mErr.Errors = append(mErr.Errors, err)
}
2016-03-24 01:21:27 +00:00
delete(c.trackedServices, service.ID)
2016-03-23 17:43:35 +00:00
}
}
// Remove the checks that are not present anymore
2016-03-24 01:21:27 +00:00
for checkID, _ := range c.trackedChecks {
if _, ok := taskChecks[checkID]; !ok {
if err := c.deregisterCheck(checkID); err != nil {
mErr.Errors = append(mErr.Errors, err)
}
2016-03-24 01:21:27 +00:00
delete(c.trackedChecks, checkID)
2016-03-23 17:43:35 +00:00
}
}
return mErr.ErrorOrNil()
2016-03-23 07:50:41 +00:00
}
2016-03-23 21:34:43 +00:00
// Shutdown de-registers the services and checks and shuts down periodic syncing
func (c *Syncer) Shutdown() error {
2016-03-23 17:43:35 +00:00
var mErr multierror.Error
2016-03-24 01:03:02 +00:00
c.shutdownLock.Lock()
if !c.shutdown {
close(c.shutdownCh)
c.shutdown = true
}
2016-03-24 01:03:02 +00:00
c.shutdownLock.Unlock()
2016-03-25 02:19:13 +00:00
// Stop all the checks that nomad is running
2016-03-25 04:17:33 +00:00
for _, cr := range c.checkRunners {
cr.Stop()
2016-03-25 02:19:13 +00:00
}
2016-03-25 23:11:45 +00:00
// De-register all the services from consul
2016-03-24 01:21:27 +00:00
for _, service := range c.trackedServices {
2016-03-23 17:43:35 +00:00
if err := c.client.Agent().ServiceDeregister(service.ID); err != nil {
mErr.Errors = append(mErr.Errors, err)
}
}
return mErr.ErrorOrNil()
2016-03-23 07:50:41 +00:00
}
2016-03-24 01:21:27 +00:00
// KeepServices removes services from consul which are not present in the
// set of service IDs passed to it. Only the services selected by
// filterConsulServices are considered for removal. A failure to list the
// services is returned immediately; de-registration errors are collected
// and returned together.
func (c *Syncer) KeepServices(services map[string]struct{}) error {
	// Get the services from Consul
	registered, err := c.client.Agent().Services()
	if err != nil {
		return err
	}

	// Remove the services from consul which are not in any of the tasks
	var mErr multierror.Error
	for _, candidate := range c.filterConsulServices(registered) {
		if _, keep := services[candidate.ID]; keep {
			continue
		}
		if err := c.deregisterService(candidate.ID); err != nil {
			mErr.Errors = append(mErr.Errors, err)
		}
	}

	return mErr.ErrorOrNil()
}
2016-03-23 21:34:43 +00:00
// registerCheck registers a check definition with Consul
func (c *Syncer) registerCheck(chkReg *consul.AgentCheckRegistration) error {
2016-03-25 04:17:33 +00:00
if cr, ok := c.checkRunners[chkReg.ID]; ok {
cr.Start()
2016-03-25 02:00:24 +00:00
}
return c.client.Agent().CheckRegister(chkReg)
}
2016-03-25 02:30:02 +00:00
// createCheckReg creates a Check that can be registered with Nomad. It also
// creates a Nomad check for the check types that it can handle.
func (c *Syncer) createCheckReg(check *structs.ServiceCheck, service *consul.AgentService) (*consul.AgentCheckRegistration, error) {
2016-03-23 17:43:35 +00:00
chkReg := consul.AgentCheckRegistration{
ID: check.Hash(service.ID),
2016-03-23 17:43:35 +00:00
Name: check.Name,
ServiceID: service.ID,
}
chkReg.Timeout = check.Timeout.String()
chkReg.Interval = check.Interval.String()
2016-03-23 17:43:35 +00:00
switch check.Type {
case structs.ServiceCheckHTTP:
if check.Protocol == "" {
check.Protocol = "http"
}
url := url.URL{
Scheme: check.Protocol,
Host: fmt.Sprintf("%s:%d", service.Address, service.Port),
Path: check.Path,
}
chkReg.HTTP = url.String()
case structs.ServiceCheckTCP:
chkReg.TCP = fmt.Sprintf("%s:%d", service.Address, service.Port)
case structs.ServiceCheckScript:
2016-03-25 02:30:02 +00:00
chkReg.TTL = (check.Interval + ttlCheckBuffer).String()
2016-03-24 22:09:50 +00:00
default:
return nil, fmt.Errorf("check type %q not valid", check.Type)
2016-03-24 20:05:08 +00:00
}
2016-03-24 22:09:50 +00:00
return &chkReg, nil
2016-03-23 07:50:41 +00:00
}
2016-03-23 17:43:35 +00:00
// createService creates a Consul AgentService from a Nomad Service
func (c *Syncer) createService(service *structs.Service) (*consul.AgentService, error) {
2016-03-23 07:50:41 +00:00
srv := consul.AgentService{
ID: service.ID(c.serviceIdentifier),
2016-03-23 07:50:41 +00:00
Service: service.Name,
Tags: service.Tags,
}
host, port := c.addrFinder(service.PortLabel)
if host != "" {
srv.Address = host
}
if port != 0 {
srv.Port = port
}
2016-03-23 07:50:41 +00:00
return &srv, nil
}
2016-03-23 17:43:35 +00:00
// registerService registers a service with Consul
func (c *Syncer) registerService(service *consul.AgentService) error {
2016-03-23 17:43:35 +00:00
srvReg := consul.AgentServiceRegistration{
ID: service.ID,
Name: service.Service,
Tags: service.Tags,
Port: service.Port,
Address: service.Address,
}
return c.client.Agent().ServiceRegister(&srvReg)
}
// deregisterService de-registers a service with the given ID from consul
func (c *Syncer) deregisterService(ID string) error {
2016-03-23 17:43:35 +00:00
return c.client.Agent().ServiceDeregister(ID)
}
2016-03-23 21:34:43 +00:00
// deregisterCheck de-registers a check with a given ID from Consul.
func (c *Syncer) deregisterCheck(ID string) error {
2016-03-25 02:00:24 +00:00
// Deleting the nomad check
2016-03-25 04:17:33 +00:00
if cr, ok := c.checkRunners[ID]; ok {
cr.Stop()
delete(c.checkRunners, ID)
2016-03-24 20:05:08 +00:00
}
2016-03-25 02:00:24 +00:00
2016-03-25 23:11:45 +00:00
// Deleting from consul
2016-03-23 17:43:35 +00:00
return c.client.Agent().CheckDeregister(ID)
}
// Run triggers periodic syncing of services and checks with Consul. This is
// a long lived go-routine which is stopped during shutdown.
func (c *Syncer) Run() {
d := initialSyncDelay + lib.RandomStagger(initialSyncBuffer-initialSyncDelay)
sync := time.NewTimer(d)
c.logger.Printf("[DEBUG] consul.sync: sleeping %v before first sync", d)
2016-03-23 07:50:41 +00:00
for {
select {
case <-sync.C:
d = syncInterval - lib.RandomStagger(syncInterval/syncJitter)
sync.Reset(d)
2016-03-23 18:31:04 +00:00
if err := c.performSync(); err != nil {
2016-05-24 04:35:31 +00:00
if c.runChecks {
c.logger.Printf("[DEBUG] consul.sync: disabling checks until Consul sync completes for %q: %v", c.serviceIdentifier, err)
2016-04-02 21:48:10 +00:00
}
2016-05-24 04:35:31 +00:00
c.runChecks = false
2016-04-02 21:48:10 +00:00
} else {
2016-05-24 04:35:31 +00:00
c.runChecks = true
2016-03-23 07:50:41 +00:00
}
case <-c.notifySyncCh:
sync.Reset(syncInterval)
2016-03-23 07:50:41 +00:00
case <-c.shutdownCh:
sync.Stop()
c.logger.Printf("[INFO] consul.sync: shutting down sync for %q", c.serviceIdentifier)
2016-03-23 07:50:41 +00:00
return
}
}
}
// RunHandlers executes every registered periodic callback, in no particular
// order. The callbacks are snapshotted under the read lock and invoked after
// the lock has been released. Their return values are ignored.
func (c *Syncer) RunHandlers() {
	c.periodicLock.RLock()
	snapshot := make([]types.PeriodicCallback, 0, len(c.periodicCallbacks))
	for _, callback := range c.periodicCallbacks {
		snapshot = append(snapshot, callback)
	}
	c.periodicLock.RUnlock()

	for _, callback := range snapshot {
		callback()
	}
}
2016-03-23 21:34:43 +00:00
// performSync sync the services and checks we are tracking with Consul.
func (c *Syncer) performSync() error {
c.RunHandlers()
2016-03-23 18:31:04 +00:00
var mErr multierror.Error
2016-03-23 17:43:35 +00:00
cServices, err := c.client.Agent().Services()
if err != nil {
2016-03-23 18:31:04 +00:00
return err
2016-03-23 17:43:35 +00:00
}
cChecks, err := c.client.Agent().Checks()
if err != nil {
2016-03-23 18:31:04 +00:00
return err
2016-03-23 17:43:35 +00:00
}
// Add services and checks that consul doesn't have but we do
2016-03-24 01:21:27 +00:00
for serviceID, service := range c.trackedServices {
2016-03-23 17:43:35 +00:00
if _, ok := cServices[serviceID]; !ok {
2016-03-23 18:31:04 +00:00
if err := c.registerService(service); err != nil {
mErr.Errors = append(mErr.Errors, err)
}
2016-03-23 17:43:35 +00:00
}
}
2016-03-24 01:21:27 +00:00
for checkID, check := range c.trackedChecks {
if _, ok := cChecks[checkID]; !ok {
if err := c.registerCheck(check); err != nil {
2016-03-23 18:31:04 +00:00
mErr.Errors = append(mErr.Errors, err)
}
2016-03-23 17:43:35 +00:00
}
}
2016-03-23 18:31:04 +00:00
return mErr.ErrorOrNil()
2016-03-23 17:43:35 +00:00
}
// filterConsulServices prunes out all the service whose ids are not prefixed
// with nomad-
func (c *Syncer) filterConsulServices(srvcs map[string]*consul.AgentService) map[string]*consul.AgentService {
2016-03-23 17:43:35 +00:00
nomadServices := make(map[string]*consul.AgentService)
for _, srv := range srvcs {
if strings.HasPrefix(srv.ID, structs.NomadConsulPrefix) &&
!strings.HasPrefix(srv.ID, structs.AgentServicePrefix) {
2016-03-23 17:43:35 +00:00
nomadServices[srv.ID] = srv
}
}
return nomadServices
}
// filterConsulChecks prunes out all the consul checks which do not have
// services with id prefixed with noamd-
func (c *Syncer) filterConsulChecks(chks map[string]*consul.AgentCheck) map[string]*consul.AgentCheck {
2016-03-23 17:43:35 +00:00
nomadChecks := make(map[string]*consul.AgentCheck)
for _, chk := range chks {
if strings.HasPrefix(chk.ServiceID, structs.NomadConsulPrefix) {
nomadChecks[chk.CheckID] = chk
}
}
return nomadChecks
2016-03-23 07:50:41 +00:00
}
2016-03-23 22:36:46 +00:00
2016-03-24 20:05:08 +00:00
// consulPresent indicates whether the consul agent is responding
func (c *Syncer) consulPresent() bool {
2016-03-23 22:36:46 +00:00
_, err := c.client.Agent().Self()
return err == nil
}
2016-03-24 20:05:08 +00:00
// runCheck runs a check and updates the corresponding ttl check in consul
func (c *Syncer) runCheck(check Check) {
2016-03-24 20:05:08 +00:00
res := check.Run()
if res.Duration >= check.Timeout() {
2016-05-05 17:45:02 +00:00
c.logger.Printf("[DEBUG] consul.sync: check took time: %v, timeout: %v", res.Duration, check.Timeout())
}
2016-03-25 02:00:24 +00:00
state := consul.HealthCritical
output := res.Output
switch res.ExitCode {
case 0:
2016-03-25 02:00:24 +00:00
state = consul.HealthPassing
case 1:
2016-03-25 02:00:24 +00:00
state = consul.HealthWarning
default:
state = consul.HealthCritical
2016-03-25 02:00:24 +00:00
}
if res.Err != nil {
state = consul.HealthCritical
output = res.Err.Error()
}
2016-03-25 02:00:24 +00:00
if err := c.client.Agent().UpdateTTL(check.ID(), output, state); err != nil {
2016-05-24 04:35:31 +00:00
if c.runChecks {
c.logger.Printf("[DEBUG] consul.sync: check %q failed, disabling Consul checks until until next successful sync: %v", check.ID(), err)
2016-05-24 04:35:31 +00:00
c.runChecks = false
2016-04-02 21:48:10 +00:00
} else {
2016-05-24 04:35:31 +00:00
c.runChecks = true
2016-04-02 21:48:10 +00:00
}
2016-03-24 20:05:08 +00:00
}
}
2016-05-14 07:36:26 +00:00
// GenerateServiceIdentifier returns a service identifier based on an
// allocation id and task name. The result has the form
// "<taskName>-<allocID>".
func GenerateServiceIdentifier(allocID string, taskName string) string {
	// fmt.Sprint only inserts spaces between operands when neither side is
	// a string, so the three parts are joined verbatim.
	return fmt.Sprint(taskName, "-", allocID)
}
// AddPeriodicHandler registers fn as a periodic callback under name. It
// reports true when the callback was added and false when a callback with
// the same name is already registered, in which case the existing callback
// is left in place.
func (c *Syncer) AddPeriodicHandler(name string, fn types.PeriodicCallback) bool {
	c.periodicLock.Lock()
	defer c.periodicLock.Unlock()

	c.logger.Printf("[DEBUG] consul.sync: adding handler named %s", name)

	_, exists := c.periodicCallbacks[name]
	if !exists {
		c.periodicCallbacks[name] = fn
		c.logger.Printf("[DEBUG] consul.sync: successfully added handler %q", name)
		return true
	}

	c.logger.Printf("[ERROR] consul.sync: failed adding handler %q", name)
	return false
}
// NumHandlers returns the number of periodic callbacks currently
// registered.
func (c *Syncer) NumHandlers() int {
	c.periodicLock.RLock()
	count := len(c.periodicCallbacks)
	c.periodicLock.RUnlock()

	return count
}
// RemovePeriodicHandler removes the periodic callback registered under the
// given name. Removing a name that is not registered is a no-op.
func (c *Syncer) RemovePeriodicHandler(name string) {
	c.periodicLock.Lock()
	delete(c.periodicCallbacks, name)
	c.periodicLock.Unlock()
}
// ConsulClient returns the Consul API client this Syncer uses to talk to
// the Consul agent.
func (c *Syncer) ConsulClient() *consul.Client {
	client := c.client
	return client
}