open-nomad/command/agent/consul/client.go
2017-04-19 12:42:47 -07:00

624 lines
18 KiB
Go

package consul
import (
"context"
"fmt"
"log"
"net"
"net/url"
"strconv"
"strings"
"sync"
"time"
"github.com/hashicorp/consul/api"
"github.com/hashicorp/go-multierror"
"github.com/hashicorp/nomad/nomad/structs"
)
// mark is the empty placeholder value used throughout this file: it is stored
// in the set-like maps (agentServices, agentChecks, deregServices,
// deregChecks) and sent on syncCh to request a sync.
var mark = struct{}{}
const (
// nomadServicePrefix is the first prefix that scopes all Nomad registered
// services. It is the leading component of every ID built by
// makeAgentServiceID and makeTaskServiceID.
nomadServicePrefix = "_nomad"
// defaultSyncInterval is the default ServiceClient.retryInterval, i.e. how
// long Run waits before retrying after a failed sync with Consul.
defaultSyncInterval = 6 * time.Second
// ttlCheckBuffer is the extra time Nomad is allowed to take to report a
// check result to Consul. It is added to a script check's Interval to form
// the TTL registered with Consul (see createCheckReg).
ttlCheckBuffer = 31 * time.Second
// defaultShutdownWait is how long Shutdown() should block waiting for
// enqueued operations to sync to Consul by default.
defaultShutdownWait = time.Minute
// DefaultQueryWaitDuration is the max duration the Consul Agent will
// spend waiting for a response from a Consul Query.
DefaultQueryWaitDuration = 2 * time.Second
// ServiceTagHTTP is the tag assigned to HTTP services
ServiceTagHTTP = "http"
// ServiceTagRPC is the tag assigned to RPC services
ServiceTagRPC = "rpc"
// ServiceTagSerf is the tag assigned to Serf services
ServiceTagSerf = "serf"
)
// ScriptExecutor is the interface the ServiceClient uses to execute script
// checks inside a container. A nil ScriptExecutor means script checks cannot
// be registered (makeCheckReg and UpdateTask return an error in that case).
type ScriptExecutor interface {
// Exec runs cmd with args under ctx.
// NOTE(review): the []byte and int results are presumably the command
// output and exit code; confirm against the implementations.
Exec(ctx context.Context, cmd string, args []string) ([]byte, int, error)
}
// CatalogAPI is the consul/api.Catalog API used by Nomad. It is declared here
// as an interface so callers can depend on just these two methods; nothing in
// the visible part of this file calls it.
type CatalogAPI interface {
Datacenters() ([]string, error)
Service(service, tag string, q *api.QueryOptions) ([]*api.CatalogService, *api.QueryMeta, error)
}
// AgentAPI is the consul/api.Agent API used by Nomad. ServiceClient applies
// its queued operations through the Register/Deregister methods (see sync and
// Shutdown) and hands the client to script checks created via newScriptCheck.
type AgentAPI interface {
CheckRegister(check *api.AgentCheckRegistration) error
CheckDeregister(checkID string) error
ServiceRegister(service *api.AgentServiceRegistration) error
ServiceDeregister(serviceID string) error
UpdateTTL(id, output, status string) error
}
// ServiceClient handles task and agent service registration with Consul.
//
// Register/Update/Remove calls only enqueue operations in pending; the Run
// loop applies them to Consul via sync() and retries on failure.
type ServiceClient struct {
// client is the Consul agent API all operations are applied through
client AgentAPI
logger *log.Logger
// retryInterval is how long Run waits before retrying a failed sync
retryInterval time.Duration
// runningCh is closed when the main Run loop exits
runningCh chan struct{}
// shutdownCh is closed when the client should shutdown
shutdownCh chan struct{}
// shutdownWait is how long Shutdown() blocks waiting for the final
// sync() to finish. Defaults to defaultShutdownWait
shutdownWait time.Duration
// syncCh triggers a sync in the main Run loop
syncCh chan struct{}
// pending service and check operations
pending *consulOps
// opsLock must be held while reading or replacing pending
opsLock sync.Mutex
// script check cancel funcs to be called before their corresponding
// check is removed. Accessed in sync() and, once Run has exited, in
// Shutdown(); not protected by any lock.
runningScripts map[string]*scriptHandle
// regLock must be held while accessing reg and dereg maps
// NOTE(review): no separate reg/dereg maps exist on this struct; the
// queued operations live in pending, which the rest of this file guards
// with opsLock. Confirm whether regLock is still needed.
regLock sync.Mutex
// Registered agent services and checks
agentServices map[string]struct{}
agentChecks map[string]struct{}
// agentLock must be held while accessing agent maps
agentLock sync.Mutex
}
// NewServiceClient builds a ServiceClient around an existing Consul agent API
// client and a logger. The returned client uses the package defaults for its
// retry interval and shutdown wait and starts with an empty operation queue;
// Run must be started by the caller for queued operations to reach Consul.
func NewServiceClient(consulClient AgentAPI, logger *log.Logger) *ServiceClient {
	sc := new(ServiceClient)

	// Collaborators supplied by the caller.
	sc.client = consulClient
	sc.logger = logger

	// Timing defaults.
	sc.retryInterval = defaultSyncInterval
	sc.shutdownWait = defaultShutdownWait

	// Signalling channels. syncCh is buffered by one so that forceSync never
	// blocks and repeated requests collapse into a single pending signal.
	sc.runningCh = make(chan struct{})
	sc.shutdownCh = make(chan struct{})
	sc.syncCh = make(chan struct{}, 1)

	// Bookkeeping state.
	sc.pending = newConsulOps()
	sc.runningScripts = make(map[string]*scriptHandle)
	sc.agentServices = make(map[string]struct{}, 8)
	sc.agentChecks = make(map[string]struct{}, 8)

	return sc
}
// Run is the main loop that applies queued operations to Consul and retries
// them on failure. It returns when shutdownCh is closed and closes runningCh
// on the way out. It should be called exactly once.
func (c *ServiceClient) Run() {
	defer close(c.runningCh)

	retry := time.NewTimer(0)
	defer retry.Stop()

	// Swallow the immediate first tick: nothing is synced until requested.
	<-retry.C

	// healthy tracks whether the previous sync succeeded so that only state
	// transitions are logged rather than every retry.
	healthy := true

	for {
		select {
		case <-c.shutdownCh:
			return

		case <-c.syncCh:
			// A sync was requested; fire the timer right away.
			retry.Reset(0)

		case <-retry.C:
			err := c.sync()
			switch {
			case err != nil:
				if healthy {
					healthy = false
					c.logger.Printf("[WARN] consul: failed to update services in Consul: %v", err)
				}
				// Try again after the retry interval.
				retry.Reset(c.retryInterval)
			case !healthy:
				c.logger.Printf("[INFO] consul: successfully updated services in Consul")
				healthy = true
			}
		}
	}
}
// forceSync requests a sync from the Run loop without blocking. Operations
// enqueued before the call are included in that sync. If a request is already
// waiting in syncCh the new one is dropped, since the waiting request will
// cover it.
func (c *ServiceClient) forceSync() {
	select {
	default:
		// A sync is already pending; nothing to do.
	case c.syncCh <- struct{}{}:
	}
}
// sync applies all currently enqueued operations to Consul in a fixed order:
// register services, register checks (starting script checks as they are
// registered), deregister checks (cancelling script checks first), and
// finally deregister services. Each operation is removed from the batch once
// it has been applied. On the first failure the remaining operations are
// merged with anything enqueued in the meantime and put back on the queue,
// and the error is returned.
func (c *ServiceClient) sync() error {
	// Take ownership of the queued operations and leave an empty queue behind.
	c.opsLock.Lock()
	batch := c.pending
	c.pending = newConsulOps()
	c.opsLock.Unlock()

	// Capture the summary before the maps below are drained.
	summary := batch.String()

	apply := func() error {
		// Register services
		for id, svc := range batch.regServices {
			if err := c.client.ServiceRegister(svc); err != nil {
				return err
			}
			delete(batch.regServices, id)
		}

		// Register checks
		for id, chk := range batch.regChecks {
			if err := c.client.CheckRegister(chk); err != nil {
				return err
			}
			delete(batch.regChecks, id)

			// Start the script for this check if it is a script check
			if script, isScript := batch.regScripts[id]; isScript {
				c.runningScripts[id] = script.run()
			}
		}

		// Deregister checks
		for id := range batch.deregChecks {
			// Stop the script first if this is a running script check
			if handle, running := c.runningScripts[id]; running {
				handle.cancel()
				delete(c.runningScripts, id)
			}
			if err := c.client.CheckDeregister(id); err != nil {
				return err
			}
			delete(batch.deregChecks, id)
		}

		// Deregister services
		for id := range batch.deregServices {
			if err := c.client.ServiceDeregister(id); err != nil {
				return err
			}
			delete(batch.deregServices, id)
		}
		return nil
	}

	if err := apply(); err != nil {
		// Requeue whatever is left, giving precedence to operations that
		// were enqueued while this sync was running.
		c.opsLock.Lock()
		batch.merge(c.pending)
		c.pending = batch
		c.opsLock.Unlock()
		return err
	}

	c.logger.Printf("[DEBUG] consul: %s", summary)
	return nil
}
// RegisterAgent registers Nomad agents (client or server). Script checks are
// not supported and will return an error. Registration is asynchronous.
//
// Each service's PortLabel (and each check's PortLabel, when set) must be a
// "host:port" address with a numeric port; anything else returns an error and
// nothing is enqueued. A check without a PortLabel uses its service's address.
//
// Agents will be deregistered when Shutdown is called.
func (c *ServiceClient) RegisterAgent(role string, services []*structs.Service) error {
	ops := newConsulOps()

	for _, service := range services {
		id := makeAgentServiceID(role, service)

		// Unlike tasks, agents advertise using the port label as the address
		host, rawport, err := net.SplitHostPort(service.PortLabel)
		if err != nil {
			return fmt.Errorf("error parsing port label %q from service %q: %v", service.PortLabel, service.Name, err)
		}
		port, err := strconv.Atoi(rawport)
		if err != nil {
			return fmt.Errorf("error parsing port %q from service %q: %v", rawport, service.Name, err)
		}

		serviceReg := &api.AgentServiceRegistration{
			ID:      id,
			Name:    service.Name,
			Tags:    service.Tags,
			Address: host,
			Port:    port,
		}
		ops.regServices[id] = serviceReg

		for _, check := range service.Checks {
			checkID := createCheckID(id, check)
			if check.Type == structs.ServiceCheckScript {
				return fmt.Errorf("service %q contains invalid check: agent checks do not support scripts", service.Name)
			}

			// Default to the service's address; a check may override it.
			checkHost, checkPort := serviceReg.Address, serviceReg.Port
			if check.PortLabel != "" {
				overrideHost, overrideRawPort, err := net.SplitHostPort(check.PortLabel)
				if err != nil {
					// Report the check's own label, which is what failed to parse.
					return fmt.Errorf("error parsing port label %q from check %q: %v", check.PortLabel, check.Name, err)
				}
				overridePort, err := strconv.Atoi(overrideRawPort)
				if err != nil {
					return fmt.Errorf("error parsing port %q from check %q: %v", overrideRawPort, check.Name, err)
				}
				checkHost, checkPort = overrideHost, overridePort
			}

			checkReg, err := createCheckReg(id, checkID, check, checkHost, checkPort)
			if err != nil {
				return fmt.Errorf("failed to add check %q: %v", check.Name, err)
			}
			ops.regChecks[checkID] = checkReg
		}
	}

	// Now add them to the registration queue
	c.opsLock.Lock()
	c.pending.merge(ops)
	c.opsLock.Unlock()

	// Record IDs for deregistering on shutdown
	c.agentLock.Lock()
	for id := range ops.regServices {
		c.agentServices[id] = mark
	}
	for id := range ops.regChecks {
		c.agentChecks[id] = mark
	}
	c.agentLock.Unlock()

	c.forceSync()
	return nil
}
// addrParser resolves a port label to a host and port. In this file the
// task's FindHostAndPortFor method is passed wherever an addrParser is needed.
type addrParser func(portLabel string) (string, int)
// makeCheckReg builds the Consul registration for a single check of the given
// service and records it in ops. Script checks additionally get a script
// runner queued in ops.regScripts and require a non-nil exec. The check is
// addressed at the service's address unless it names its own port label, in
// which case parseAddr resolves that label.
func (c *ServiceClient) makeCheckReg(ops *consulOps, check *structs.ServiceCheck,
	service *api.AgentServiceRegistration, exec ScriptExecutor, parseAddr addrParser) error {

	id := createCheckID(service.ID, check)

	isScript := check.Type == structs.ServiceCheckScript
	if isScript && exec == nil {
		return fmt.Errorf("driver doesn't support script checks")
	}
	if isScript {
		ops.regScripts[id] = newScriptCheck(id, check, exec, c.client, c.logger, c.shutdownCh)
	}

	// Start from the service's own address and override if the check asks.
	addr, portNum := service.Address, service.Port
	if label := check.PortLabel; label != "" {
		addr, portNum = parseAddr(label)
	}

	reg, err := createCheckReg(service.ID, id, check, addr, portNum)
	if err != nil {
		return fmt.Errorf("failed to add check %q: %v", check.Name, err)
	}

	ops.regChecks[id] = reg
	return nil
}
// serviceRegs records in ops everything needed to register one task service:
// the service registration itself plus a check registration (and script
// runner, where applicable) for each of its checks. It stops at and returns
// the first check error.
func (c *ServiceClient) serviceRegs(ops *consulOps, allocID string, service *structs.Service,
	exec ScriptExecutor, task *structs.Task) error {

	svcID := makeTaskServiceID(allocID, task.Name, service)
	addr, portNum := task.FindHostAndPortFor(service.PortLabel)

	// Give the registration its own copy of the tags. Not strictly required,
	// but it avoids aliasing bugs, especially in tests that reuse Tasks.
	tags := make([]string, len(service.Tags))
	copy(tags, service.Tags)

	reg := &api.AgentServiceRegistration{
		ID:      svcID,
		Name:    service.Name,
		Tags:    tags,
		Address: addr,
		Port:    portNum,
	}
	ops.regServices[svcID] = reg

	for i := range service.Checks {
		if err := c.makeCheckReg(ops, service.Checks[i], reg, exec, task.FindHostAndPortFor); err != nil {
			return err
		}
	}
	return nil
}
// RegisterTask enqueues registration of every service and check of the task.
// If exec is nil and the task has a script check, an error is returned and
// nothing is enqueued.
//
// The actual communication with Consul happens asynchronously (see Run).
func (c *ServiceClient) RegisterTask(allocID string, task *structs.Task, exec ScriptExecutor) error {
	batch := newConsulOps()

	// Build all registrations first so a failure leaves the queue untouched.
	for i := range task.Services {
		err := c.serviceRegs(batch, allocID, task.Services[i], exec, task)
		if err != nil {
			return err
		}
	}

	// Hand the batch to the Run loop.
	c.opsLock.Lock()
	c.pending.merge(batch)
	c.opsLock.Unlock()

	c.forceSync()
	return nil
}
// UpdateTask enqueues the operations needed to move a task's Consul
// registrations from the existing task definition to newTask:
//
//   - services present only in existing are deregistered with their checks;
//   - services present in both (same service ID) keep their service
//     registration, while their checks are replaced: checks that no longer
//     exist are deregistered and every check of the new service is
//     (re)registered;
//   - services present only in newTask are registered with their checks.
//
// An error is returned, and nothing is enqueued, if a script check is present
// but exec is nil, or if a check registration cannot be built.
//
// Actual communication with Consul is done asynchronously (see Run).
func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Task, exec ScriptExecutor) error {
	ops := newConsulOps()

	// Index both generations of services by their Consul service ID.
	existingIDs := make(map[string]*structs.Service, len(existing.Services))
	for _, s := range existing.Services {
		existingIDs[makeTaskServiceID(allocID, existing.Name, s)] = s
	}
	newIDs := make(map[string]*structs.Service, len(newTask.Services))
	for _, s := range newTask.Services {
		newIDs[makeTaskServiceID(allocID, newTask.Name, s)] = s
	}

	parseAddr := newTask.FindHostAndPortFor

	// Loop over existing Service IDs to see if they have been removed or
	// updated.
	for existingID, existingSvc := range existingIDs {
		newSvc, ok := newIDs[existingID]
		if !ok {
			// Existing service entry removed
			ops.deregServices[existingID] = mark
			for _, check := range existingSvc.Checks {
				ops.deregChecks[createCheckID(existingID, check)] = mark
			}
			continue
		}

		// Manipulating checks is cheap and easy, so just remove old and add new
		for _, check := range existingSvc.Checks {
			ops.deregChecks[createCheckID(existingID, check)] = mark
		}

		// Register new checks
		for _, check := range newSvc.Checks {
			checkID := createCheckID(existingID, check)
			// Don't deregister this check if it hasn't changed
			delete(ops.deregChecks, checkID)

			if check.Type == structs.ServiceCheckScript {
				if exec == nil {
					return fmt.Errorf("driver doesn't support script checks")
				}
				ops.regScripts[checkID] = newScriptCheck(
					checkID, check, exec, c.client, c.logger, c.shutdownCh)
			}

			// NOTE(review): the default address is resolved from the existing
			// service's port label. The service ID is built from name and
			// tags only, so a changed PortLabel is not detected here; confirm
			// whether that case needs a service re-registration.
			host, port := parseAddr(existingSvc.PortLabel)
			if check.PortLabel != "" {
				host, port = parseAddr(check.PortLabel)
			}

			checkReg, err := createCheckReg(existingID, checkID, check, host, port)
			if err != nil {
				// Same wrapping as makeCheckReg so callers see which check failed.
				return fmt.Errorf("failed to add check %q: %v", check.Name, err)
			}
			ops.regChecks[checkID] = checkReg
		}

		// Service hasn't changed and checks are updated so don't
		// process this service again later
		delete(newIDs, existingID)
	}

	// Any remaining services should just be enqueued directly
	for _, newSvc := range newIDs {
		if err := c.serviceRegs(ops, allocID, newSvc, exec, newTask); err != nil {
			return err
		}
	}

	// Finally enqueue the updates and force sync
	c.opsLock.Lock()
	c.pending.merge(ops)
	c.opsLock.Unlock()

	c.forceSync()
	return nil
}
// RemoveTask from Consul. Enqueues deregistration of all of the task's
// service entries and their checks.
//
// Actual communication with Consul is done asynchronously (see Run).
func (c *ServiceClient) RemoveTask(allocID string, task *structs.Task) {
	ops := newConsulOps()

	for _, service := range task.Services {
		id := makeTaskServiceID(allocID, task.Name, service)
		ops.deregServices[id] = mark

		for _, check := range service.Checks {
			ops.deregChecks[createCheckID(id, check)] = mark
		}
	}

	// Now add them to the pending operations; the main Run loop will apply
	// them. pending is guarded by opsLock everywhere else in this file
	// (sync, RegisterAgent, RegisterTask, UpdateTask), so the same lock must
	// be taken here to avoid racing with them.
	c.opsLock.Lock()
	c.pending.merge(ops)
	c.opsLock.Unlock()

	c.forceSync()
}
// Shutdown stops the Consul client. It signals the Run loop to exit,
// deregisters the agent's own services and checks directly, waits for Run to
// finish and performs one last sync of enqueued operations, then waits for
// running script checks to exit. Waiting is bounded by shutdownWait, measured
// from the start of the call; on timeout the errors collected so far are
// returned together with a timeout error. Calling Shutdown again after
// shutdownCh has been closed is a no-op that returns nil.
func (c *ServiceClient) Shutdown() error {
	select {
	case <-c.shutdownCh:
		// Already shut down.
		return nil
	default:
		close(c.shutdownCh)
	}

	var result multierror.Error
	record := func(err error) {
		result.Errors = append(result.Errors, err)
	}

	// Single deadline shared by every wait below so Shutdown cannot block
	// indefinitely.
	timeout := time.After(c.shutdownWait)

	// Deregister the agent's services first, then its checks.
	c.agentLock.Lock()
	for svcID := range c.agentServices {
		if err := c.client.ServiceDeregister(svcID); err != nil {
			record(err)
		}
	}
	for chkID := range c.agentChecks {
		if err := c.client.CheckDeregister(chkID); err != nil {
			record(err)
		}
	}
	c.agentLock.Unlock()

	// Wait for Run to finish any outstanding sync() call and exit.
	select {
	case <-timeout:
		record(fmt.Errorf("timed out waiting for Consul operations to complete"))
		return result.ErrorOrNil()
	case <-c.runningCh:
		// Flush whatever is still enqueued.
		if err := c.sync(); err != nil {
			record(err)
		}
	}

	// Give script checks time to exit. No locking needed: Run has exited.
	for _, handle := range c.runningScripts {
		select {
		case <-timeout:
			record(fmt.Errorf("timed out waiting for script checks to run"))
			return result.ErrorOrNil()
		case <-handle.wait():
		}
	}

	return result.ErrorOrNil()
}
// makeAgentServiceID builds the Consul service ID for a Nomad agent service
// by joining the Nomad prefix, the agent role, the service name and every
// service tag with "-".
//
// Agent service IDs are of the form:
//
//	{nomadServicePrefix}-{ROLE}-{Service.Name}-{Service.Tags...}
//	Example Server ID: _nomad-server-nomad-serf
//	Example Client ID: _nomad-client-nomad-client-http
func makeAgentServiceID(role string, service *structs.Service) string {
	segments := make([]string, 0, len(service.Tags)+3)
	segments = append(segments, nomadServicePrefix, role, service.Name)
	segments = append(segments, service.Tags...)
	return strings.Join(segments, "-")
}
// makeTaskServiceID builds the Consul service ID for a task service by
// joining the Nomad prefix, the literal "executor", the allocation ID, the
// task name, the service name and every service tag with "-".
//
// Task service IDs are of the form:
//
//	{nomadServicePrefix}-executor-{ALLOC_ID}-{TASK_NAME}-{Service.Name}-{Service.Tags...}
//	Example Service ID: _nomad-executor-1234-mytask-echo-tag1-tag2-tag3
func makeTaskServiceID(allocID, taskName string, service *structs.Service) string {
	segments := make([]string, 0, len(service.Tags)+5)
	segments = append(segments, nomadServicePrefix, "executor", allocID, taskName, service.Name)
	segments = append(segments, service.Tags...)
	return strings.Join(segments, "-")
}
// createCheckID returns the ID of a check belonging to the given service. The
// ID is whatever the check's Hash method yields for that service ID.
func createCheckID(serviceID string, check *structs.ServiceCheck) string {
	checkID := check.Hash(serviceID)
	return checkID
}
// createCheckReg creates a Check that can be registered with Consul.
//
// HTTP checks get a URL built from the check's protocol (defaulting to
// "http"), the given host and port, and the check's path. TCP checks get the
// joined host:port. Script checks are registered as TTL checks whose TTL is
// the check interval plus ttlCheckBuffer; running the script and reporting
// its result must be handled externally. Any other check type, or an
// unparsable HTTP path, returns an error.
//
// The check passed in is never modified.
func createCheckReg(serviceID, checkID string, check *structs.ServiceCheck, host string, port int) (*api.AgentCheckRegistration, error) {
	chkReg := api.AgentCheckRegistration{
		ID:        checkID,
		Name:      check.Name,
		ServiceID: serviceID,
	}
	chkReg.Status = check.InitialStatus
	chkReg.Timeout = check.Timeout.String()
	chkReg.Interval = check.Interval.String()

	switch check.Type {
	case structs.ServiceCheckHTTP:
		// Default the protocol locally instead of writing it back to the
		// caller's check. NOTE(review): check IDs come from check.Hash; if
		// that hash covers Protocol, mutating the check here would change
		// the IDs computed for it later. Confirm against ServiceCheck.Hash.
		protocol := check.Protocol
		if protocol == "" {
			protocol = "http"
		}
		base := url.URL{
			Scheme: protocol,
			Host:   net.JoinHostPort(host, strconv.Itoa(port)),
		}
		relative, err := url.Parse(check.Path)
		if err != nil {
			return nil, err
		}
		chkReg.HTTP = base.ResolveReference(relative).String()
	case structs.ServiceCheckTCP:
		chkReg.TCP = net.JoinHostPort(host, strconv.Itoa(port))
	case structs.ServiceCheckScript:
		chkReg.TTL = (check.Interval + ttlCheckBuffer).String()
	default:
		return nil, fmt.Errorf("check type %+q not valid", check.Type)
	}
	return &chkReg, nil
}