package consul

import (
	"context"
	"fmt"
	"log"
	"net"
	"net/url"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/nomad/structs"
)

// mark is the value stored in key-only maps used as sets.
var mark = struct{}{}
const (
	// nomadServicePrefix is the prefix that scopes all Nomad-registered
	// services
	nomadServicePrefix = "_nomad"

	// defaultSyncInterval is the periodic interval for syncing services and
	// checks with Consul
	defaultSyncInterval = 6 * time.Second

	// ttlCheckBuffer is the time interval Nomad can take to report the check
	// result to Consul
	ttlCheckBuffer = 31 * time.Second

	// defaultShutdownWait is how long Shutdown() should block waiting for
	// enqueued operations to sync to Consul by default.
	defaultShutdownWait = time.Minute

	// DefaultQueryWaitDuration is the max duration the Consul Agent will
	// spend waiting for a response from a Consul Query.
	DefaultQueryWaitDuration = 2 * time.Second

	// ServiceTagHTTP is the tag assigned to HTTP services
	ServiceTagHTTP = "http"

	// ServiceTagRPC is the tag assigned to RPC services
	ServiceTagRPC = "rpc"

	// ServiceTagSerf is the tag assigned to Serf services
	ServiceTagSerf = "serf"
)
// ScriptExecutor is the interface the ServiceClient uses to execute script
// checks inside a container.
type ScriptExecutor interface {
	Exec(ctx context.Context, cmd string, args []string) ([]byte, int, error)
}

// CatalogAPI is the consul/api.Catalog API used by Nomad.
type CatalogAPI interface {
	Datacenters() ([]string, error)
	Service(service, tag string, q *api.QueryOptions) ([]*api.CatalogService, *api.QueryMeta, error)
}

// AgentAPI is the consul/api.Agent API used by Nomad.
type AgentAPI interface {
	CheckRegister(check *api.AgentCheckRegistration) error
	CheckDeregister(checkID string) error
	ServiceRegister(service *api.AgentServiceRegistration) error
	ServiceDeregister(serviceID string) error
	UpdateTTL(id, output, status string) error
}

// ServiceClient handles task and agent service registration with Consul.
type ServiceClient struct {
	client        AgentAPI
	logger        *log.Logger
	retryInterval time.Duration

	// runningCh is closed when the main Run loop exits
	runningCh chan struct{}

	// shutdownCh is closed when the client should shutdown
	shutdownCh chan struct{}

	// shutdownWait is how long Shutdown() blocks waiting for the final
	// sync() to finish. Defaults to defaultShutdownWait
	shutdownWait time.Duration

	// syncCh triggers a sync in the main Run loop
	syncCh chan struct{}

	// pending service and check operations
	pending *consulOps
	opsLock sync.Mutex

	// script check cancel funcs to be called before their corresponding
	// check is removed. Only accessed in sync() so not covered by regLock
	runningScripts map[string]*scriptHandle

	// regLock must be held while accessing reg and dereg maps
	regLock sync.Mutex

	// Registered agent services and checks
	agentServices map[string]struct{}
	agentChecks   map[string]struct{}

	// agentLock must be held while accessing agent maps
	agentLock sync.Mutex
}
// NewServiceClient creates a new Consul ServiceClient from an existing Consul API
// Client and logger.
func NewServiceClient(consulClient AgentAPI, logger *log.Logger) *ServiceClient {
	return &ServiceClient{
		client:         consulClient,
		logger:         logger,
		retryInterval:  defaultSyncInterval,
		runningCh:      make(chan struct{}),
		shutdownCh:     make(chan struct{}),
		shutdownWait:   defaultShutdownWait,
		syncCh:         make(chan struct{}, 1),
		pending:        newConsulOps(),
		runningScripts: make(map[string]*scriptHandle),
		agentServices:  make(map[string]struct{}, 8),
		agentChecks:    make(map[string]struct{}, 8),
	}
}
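
// The following usage sketch is editorial and not part of the original file.
// It shows one minimal way the client is expected to be wired up, assuming a
// real *api.Client obtained from api.NewClient; the exact call sites inside
// Nomad may differ.
//
//	consulAPI, err := api.NewClient(api.DefaultConfig())
//	if err != nil {
//		// handle error
//	}
//	sc := NewServiceClient(consulAPI.Agent(), log.New(os.Stderr, "", log.LstdFlags))
//	go sc.Run()         // retry loop; must be called exactly once
//	defer sc.Shutdown() // deregisters agent entries and flushes pending operations
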
// Run the Consul main loop which retries operations against Consul. It should
// be called exactly once.
func (c *ServiceClient) Run() {
	defer close(c.runningCh)
	timer := time.NewTimer(0)
	defer timer.Stop()

	// Drain the initial tick so we don't sync until instructed
	<-timer.C

	lastOk := true
	for {
		select {
		case <-c.syncCh:
			timer.Reset(0)
		case <-timer.C:
			if err := c.sync(); err != nil {
				if lastOk {
					lastOk = false
					c.logger.Printf("[WARN] consul: failed to update services in Consul: %v", err)
				}
				timer.Reset(c.retryInterval)
			} else {
				if !lastOk {
					c.logger.Printf("[INFO] consul: successfully updated services in Consul")
					lastOk = true
				}
			}
		case <-c.shutdownCh:
			return
		}
	}
}

// forceSync asynchronously causes a sync to happen. Any operations enqueued
// prior to calling forceSync will be synced.
func (c *ServiceClient) forceSync() {
	select {
	case c.syncCh <- mark:
	default:
	}
}
// sync enqueued operations.
func (c *ServiceClient) sync() error {
	c.opsLock.Lock()
	ops := c.pending
	c.pending = newConsulOps()
	c.opsLock.Unlock()

	var err error

	msg := ops.String()

	// Register Services
	for id, service := range ops.regServices {
		if err = c.client.ServiceRegister(service); err != nil {
			goto ERROR
		}
		delete(ops.regServices, id)
	}

	// Register Checks
	for id, check := range ops.regChecks {
		if err = c.client.CheckRegister(check); err != nil {
			goto ERROR
		}
		delete(ops.regChecks, id)

		// Run the script for this check if one exists
		if script, ok := ops.regScripts[id]; ok {
			// This check is a script check; run it
			c.runningScripts[id] = script.run()
		}
	}

	// Deregister Checks
	for id := range ops.deregChecks {
		if h, ok := c.runningScripts[id]; ok {
			// This check is a script check; stop it
			h.cancel()
			delete(c.runningScripts, id)
		}

		if err = c.client.CheckDeregister(id); err != nil {
			goto ERROR
		}
		delete(ops.deregChecks, id)
	}

	// Deregister Services
	for id := range ops.deregServices {
		if err = c.client.ServiceDeregister(id); err != nil {
			goto ERROR
		}
		delete(ops.deregServices, id)
	}

	c.logger.Printf("[DEBUG] consul: %s", msg)
	return nil

	//TODO Labels and gotos are nasty; move to a function?
ERROR:
	// An error occurred; repopulate the operation maps but give
	// precedence to new ops
	c.opsLock.Lock()
	ops.merge(c.pending)
	c.pending = ops
	c.opsLock.Unlock()
	return err
}
// RegisterAgent registers Nomad agents (client or server). Script checks are
// not supported and will return an error. Registration is asynchronous.
//
// Agents will be deregistered when Shutdown is called.
func (c *ServiceClient) RegisterAgent(role string, services []*structs.Service) error {
	ops := newConsulOps()

	for _, service := range services {
		id := makeAgentServiceID(role, service)
		host, rawport, err := net.SplitHostPort(service.PortLabel)
		if err != nil {
			return fmt.Errorf("error parsing port label %q from service %q: %v", service.PortLabel, service.Name, err)
		}
		port, err := strconv.Atoi(rawport)
		if err != nil {
			return fmt.Errorf("error parsing port %q from service %q: %v", rawport, service.Name, err)
		}
		serviceReg := &api.AgentServiceRegistration{
			ID:      id,
			Name:    service.Name,
			Tags:    service.Tags,
			Address: host,
			Port:    port,
		}
		ops.regServices[id] = serviceReg

		for _, check := range service.Checks {
			checkID := createCheckID(id, check)
			if check.Type == structs.ServiceCheckScript {
				return fmt.Errorf("service %q contains invalid check: agent checks do not support scripts", service.Name)
			}
			checkHost, checkPort := serviceReg.Address, serviceReg.Port
			if check.PortLabel != "" {
				host, rawport, err := net.SplitHostPort(check.PortLabel)
				if err != nil {
					return fmt.Errorf("error parsing port label %q from check %q: %v", check.PortLabel, check.Name, err)
				}
				port, err := strconv.Atoi(rawport)
				if err != nil {
					return fmt.Errorf("error parsing port %q from check %q: %v", rawport, check.Name, err)
				}
				checkHost, checkPort = host, port
			}
			checkReg, err := createCheckReg(id, checkID, check, checkHost, checkPort)
			if err != nil {
				return fmt.Errorf("failed to add check %q: %v", check.Name, err)
			}
			ops.regChecks[checkID] = checkReg
		}
	}

	// Now add them to the registration queue
	c.opsLock.Lock()
	c.pending.merge(ops)
	c.opsLock.Unlock()

	// Record IDs for deregistering on shutdown
	c.agentLock.Lock()
	for id := range ops.regServices {
		c.agentServices[id] = mark
	}
	for id := range ops.regChecks {
		c.agentChecks[id] = mark
	}
	c.agentLock.Unlock()
	c.forceSync()
	return nil
}
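
// Editorial illustration, not part of the original file: agent services must
// supply PortLabel as a literal "host:port" address (it is parsed with
// net.SplitHostPort), unlike task services whose labels are resolved by the
// task. A hypothetical call such as
//
//	c.RegisterAgent("server", []*structs.Service{{
//		Name:      "nomad",
//		PortLabel: "127.0.0.1:4648",
//		Tags:      []string{"serf"},
//	}})
//
// enqueues a registration under the ID "_nomad-server-nomad-serf" (see
// makeAgentServiceID), which is deregistered again on Shutdown.
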
// addrParser parses a port label and returns the host and port a check
// should be registered against.
type addrParser func(portLabel string) (string, int)
// makeCheckReg adds a check registration, and a script check entry when the
// check type is script, to the given operations.
func (c *ServiceClient) makeCheckReg(ops *consulOps, check *structs.ServiceCheck,
	service *api.AgentServiceRegistration, exec ScriptExecutor, parseAddr addrParser) error {

	checkID := createCheckID(service.ID, check)
	if check.Type == structs.ServiceCheckScript {
		if exec == nil {
			return fmt.Errorf("driver doesn't support script checks")
		}
		ops.regScripts[checkID] = newScriptCheck(
			checkID, check, exec, c.client, c.logger, c.shutdownCh)
	}
	host, port := service.Address, service.Port
	if check.PortLabel != "" {
		host, port = parseAddr(check.PortLabel)
	}
	checkReg, err := createCheckReg(service.ID, checkID, check, host, port)
	if err != nil {
		return fmt.Errorf("failed to add check %q: %v", check.Name, err)
	}
	ops.regChecks[checkID] = checkReg
	return nil
}
// serviceRegs creates service registrations, check registrations, and script
// checks from a service.
func (c *ServiceClient) serviceRegs(ops *consulOps, allocID string, service *structs.Service,
	exec ScriptExecutor, task *structs.Task) error {

	id := makeTaskServiceID(allocID, task.Name, service)
	host, port := task.FindHostAndPortFor(service.PortLabel)
	serviceReg := &api.AgentServiceRegistration{
		ID:      id,
		Name:    service.Name,
		Tags:    make([]string, len(service.Tags)),
		Address: host,
		Port:    port,
	}
	// copy isn't strictly necessary but can avoid bugs especially
	// with tests that may reuse Tasks
	copy(serviceReg.Tags, service.Tags)
	ops.regServices[id] = serviceReg

	for _, check := range service.Checks {
		err := c.makeCheckReg(ops, check, serviceReg, exec, task.FindHostAndPortFor)
		if err != nil {
			return err
		}
	}
	return nil
}
// RegisterTask with Consul. Adds all service entries and checks to Consul. If
// exec is nil and a script check exists an error is returned.
//
// Actual communication with Consul is done asynchronously (see Run).
func (c *ServiceClient) RegisterTask(allocID string, task *structs.Task, exec ScriptExecutor) error {
	ops := newConsulOps()
	for _, service := range task.Services {
		if err := c.serviceRegs(ops, allocID, service, exec, task); err != nil {
			return err
		}
	}

	// Now add them to the registration queue
	c.opsLock.Lock()
	c.pending.merge(ops)
	c.opsLock.Unlock()
	c.forceSync()
	return nil
}
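
// Editorial illustration, not part of the original file: a task service such
// as the hypothetical
//
//	&structs.Service{
//		Name:      "web",
//		PortLabel: "http",
//		Checks: []*structs.ServiceCheck{{
//			Name:     "alive",
//			Type:     structs.ServiceCheckTCP,
//			Interval: 10 * time.Second,
//			Timeout:  2 * time.Second,
//		}},
//	}
//
// has its address resolved via task.FindHostAndPortFor("http") and is keyed by
// makeTaskServiceID. If any check is of type script, a non-nil ScriptExecutor
// must be passed or RegisterTask returns an error before enqueueing anything.
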
// UpdateTask in Consul. Does not alter the service if only checks have
// changed.
func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Task, exec ScriptExecutor) error {
	ops := newConsulOps()

	existingIDs := make(map[string]*structs.Service, len(existing.Services))
	for _, s := range existing.Services {
		id := makeTaskServiceID(allocID, existing.Name, s)
		existingIDs[id] = s
		c.logger.Printf("[DEBUG] consul: existing service ID %q", id)
	}
	newIDs := make(map[string]*structs.Service, len(newTask.Services))
	for _, s := range newTask.Services {
		id := makeTaskServiceID(allocID, newTask.Name, s)
		newIDs[id] = s
		c.logger.Printf("[DEBUG] consul: updated service ID %q", id)
	}

	parseAddr := newTask.FindHostAndPortFor

	// Loop over existing Service IDs to see if they have been removed or
	// updated.
	for existingID, existingSvc := range existingIDs {
		newSvc, ok := newIDs[existingID]
		if !ok {
			// Existing service entry removed
			ops.deregServices[existingID] = mark
			for _, check := range existingSvc.Checks {
				ops.deregChecks[createCheckID(existingID, check)] = mark
			}
			continue
		}

		// Manipulating checks is cheap and easy, so just remove old and add new
		for _, check := range existingSvc.Checks {
			ops.deregChecks[createCheckID(existingID, check)] = mark
		}

		// Register new checks
		for _, check := range newSvc.Checks {
			checkID := createCheckID(existingID, check)
			// Don't deregister this check if it hasn't changed
			delete(ops.deregChecks, checkID)
			if check.Type == structs.ServiceCheckScript {
				if exec == nil {
					return fmt.Errorf("driver doesn't support script checks")
				}
				ops.regScripts[checkID] = newScriptCheck(
					checkID, check, exec, c.client, c.logger, c.shutdownCh)
			}
			host, port := parseAddr(existingSvc.PortLabel)
			if check.PortLabel != "" {
				host, port = parseAddr(check.PortLabel)
			}
			checkReg, err := createCheckReg(existingID, checkID, check, host, port)
			if err != nil {
				return err
			}
			ops.regChecks[checkID] = checkReg
		}

		// Service hasn't changed and checks are updated so don't
		// process this service again later
		delete(newIDs, existingID)
	}

	// Any remaining services should just be enqueued directly
	for _, newSvc := range newIDs {
		err := c.serviceRegs(ops, allocID, newSvc, exec, newTask)
		if err != nil {
			return err
		}
	}

	// Finally enqueue the updates and force sync
	c.opsLock.Lock()
	c.pending.merge(ops)
	c.opsLock.Unlock()

	c.forceSync()
	return nil
}
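
// Editorial note, not part of the original file: the diff above keys services
// by makeTaskServiceID, which encodes the alloc ID, task name, service name,
// and tags. A service whose name and tags are unchanged keeps its ID, so only
// its checks are re-registered; changing a tag (or name) yields a new ID, so
// the old entry is deregistered and the new one registered on the next sync.
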
// RemoveTask from Consul. Removes all service entries and checks.
//
// Actual communication with Consul is done asynchronously (see Run).
func (c *ServiceClient) RemoveTask(allocID string, task *structs.Task) {
	ops := newConsulOps()

	for _, service := range task.Services {
		id := makeTaskServiceID(allocID, task.Name, service)
		ops.deregServices[id] = mark

		for _, check := range service.Checks {
			ops.deregChecks[createCheckID(id, check)] = mark
		}
	}

	// Now enqueue the deregistrations; the main Run loop will sync them
	c.opsLock.Lock()
	c.pending.merge(ops)
	c.opsLock.Unlock()

	c.forceSync()
}
// Shutdown the Consul client. Deregisters agent services and checks and
// applies any pending task registration operations. Blocks up to shutdownWait
// before giving up on syncing operations.
func (c *ServiceClient) Shutdown() error {
	select {
	case <-c.shutdownCh:
		return nil
	default:
		close(c.shutdownCh)
	}

	var mErr multierror.Error

	// Don't let Shutdown block indefinitely
	deadline := time.After(c.shutdownWait)

	// Deregister agent services and checks
	c.agentLock.Lock()
	for id := range c.agentServices {
		if err := c.client.ServiceDeregister(id); err != nil {
			mErr.Errors = append(mErr.Errors, err)
		}
	}

	// Deregister agent checks
	for id := range c.agentChecks {
		if err := c.client.CheckDeregister(id); err != nil {
			mErr.Errors = append(mErr.Errors, err)
		}
	}
	c.agentLock.Unlock()

	// Wait for Run to finish any outstanding sync() calls and exit
	select {
	case <-c.runningCh:
		// sync one last time to ensure all enqueued operations are applied
		if err := c.sync(); err != nil {
			mErr.Errors = append(mErr.Errors, err)
		}
	case <-deadline:
		// Don't wait forever though
		mErr.Errors = append(mErr.Errors, fmt.Errorf("timed out waiting for Consul operations to complete"))
		return mErr.ErrorOrNil()
	}

	// Give script checks time to exit (no need to lock as Run() has exited)
	for _, h := range c.runningScripts {
		select {
		case <-h.wait():
		case <-deadline:
			mErr.Errors = append(mErr.Errors, fmt.Errorf("timed out waiting for script checks to run"))
			return mErr.ErrorOrNil()
		}
	}
	return mErr.ErrorOrNil()
}
// makeAgentServiceID creates a unique ID for identifying an agent service in
// Consul.
//
// Agent service IDs are of the form:
//
//	{nomadServicePrefix}-{ROLE}-{Service.Name}-{Service.Tags...}
//
// Example Server ID: _nomad-server-nomad-serf
// Example Client ID: _nomad-client-nomad-client-http
func makeAgentServiceID(role string, service *structs.Service) string {
	parts := make([]string, len(service.Tags)+3)
	parts[0] = nomadServicePrefix
	parts[1] = role
	parts[2] = service.Name
	copy(parts[3:], service.Tags)
	return strings.Join(parts, "-")
}
// makeTaskServiceID creates a unique ID for identifying a task service in
// Consul.
//
// Task service IDs are of the form:
//
//	{nomadServicePrefix}-executor-{ALLOC_ID}-{Task.Name}-{Service.Name}-{Service.Tags...}
//
// Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3
func makeTaskServiceID(allocID, taskName string, service *structs.Service) string {
	parts := make([]string, len(service.Tags)+5)
	parts[0] = nomadServicePrefix
	parts[1] = "executor"
	parts[2] = allocID
	parts[3] = taskName
	parts[4] = service.Name
	copy(parts[5:], service.Tags)
	return strings.Join(parts, "-")
}
// createCheckID creates a unique ID for a check.
func createCheckID(serviceID string, check *structs.ServiceCheck) string {
	return check.Hash(serviceID)
}
// createCheckReg creates a Check that can be registered with Consul.
//
// Only supports HTTP(S) and TCP checks. Script checks must be handled
// externally.
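//
// For illustration (editorial note, not in the original comment): given host
// "10.0.0.1" and port 8080, an HTTP check with Path "/health" yields
// HTTP = "http://10.0.0.1:8080/health", a TCP check yields
// TCP = "10.0.0.1:8080", and a script check only receives a TTL of
// Interval+ttlCheckBuffer because the script itself is run by Nomad, which
// reports the result via UpdateTTL.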
func createCheckReg(serviceID, checkID string, check *structs.ServiceCheck, host string, port int) (*api.AgentCheckRegistration, error) {
	chkReg := api.AgentCheckRegistration{
		ID:        checkID,
		Name:      check.Name,
		ServiceID: serviceID,
	}
	chkReg.Status = check.InitialStatus
	chkReg.Timeout = check.Timeout.String()
	chkReg.Interval = check.Interval.String()

	switch check.Type {
	case structs.ServiceCheckHTTP:
		if check.Protocol == "" {
			check.Protocol = "http"
		}
		base := url.URL{
			Scheme: check.Protocol,
			Host:   net.JoinHostPort(host, strconv.Itoa(port)),
		}
		relative, err := url.Parse(check.Path)
		if err != nil {
			return nil, err
		}
		// Avoid shadowing the url package; behavior is unchanged.
		checkURL := base.ResolveReference(relative)
		chkReg.HTTP = checkURL.String()
	case structs.ServiceCheckTCP:
		chkReg.TCP = net.JoinHostPort(host, strconv.Itoa(port))
	case structs.ServiceCheckScript:
		chkReg.TTL = (check.Interval + ttlCheckBuffer).String()
	default:
		return nil, fmt.Errorf("check type %+q not valid", check.Type)
	}
	return &chkReg, nil
}