2017-02-01 00:43:57 +00:00
|
|
|
package consul
|
|
|
|
|
|
|
|
import (
|
2017-08-26 05:40:18 +00:00
|
|
|
"context"
|
2017-02-01 00:43:57 +00:00
|
|
|
"fmt"
|
|
|
|
"net"
|
|
|
|
"net/url"
|
consul: Use a stable identifier for services
The current implementation of Service Registration uses a hash of the
nomad-internal state of a service to register it with Consul, this means that
any update to the service invalidates this name and we then deregister, and
recreate the service in Consul.
While this behaviour slightly simplifies reasoning about service registration,
this becomes problematic when we add consul health checks to a service. When
the service is re-registered, so are the checks, which default to failing for
at least one check period.
This commit migrates us to using a stable identifier based on the
allocation, task, and service identifiers, and uses the difference
between the remote and local state to decide when to push updates.
It uses the existing hashing mechanic to decide when UpdateTask should
regenerate service registrations for providing to Sync, but this should
be removable as part of a future refactor.
It additionally introduces the _nomad-check- prefix for check
definitions, to allow for future allowing of consul features like
maintenance mode.
2019-04-10 08:39:24 +00:00
|
|
|
"reflect"
|
2017-02-01 00:43:57 +00:00
|
|
|
"strconv"
|
|
|
|
"strings"
|
|
|
|
"sync"
|
2017-07-24 19:12:02 +00:00
|
|
|
"sync/atomic"
|
2017-02-01 00:43:57 +00:00
|
|
|
"time"
|
|
|
|
|
2021-03-16 18:22:21 +00:00
|
|
|
"github.com/armon/go-metrics"
|
2018-09-13 17:43:40 +00:00
|
|
|
log "github.com/hashicorp/go-hclog"
|
2021-03-16 18:22:21 +00:00
|
|
|
"github.com/pkg/errors"
|
2018-09-13 17:43:40 +00:00
|
|
|
|
|
|
|
"github.com/hashicorp/consul/api"
|
2017-08-07 22:54:05 +00:00
|
|
|
"github.com/hashicorp/nomad/helper"
|
2017-02-01 00:43:57 +00:00
|
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
2019-01-04 23:01:35 +00:00
|
|
|
"github.com/hashicorp/nomad/plugins/drivers"
|
2017-02-01 00:43:57 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
const (
	// nomadServicePrefix is the prefix that scopes all Nomad registered
	// services (both agent and task entries).
	nomadServicePrefix = "_nomad"

	// nomadTaskPrefix is the prefix that scopes Nomad registered services
	// for tasks.
	nomadTaskPrefix = nomadServicePrefix + "-task-"

	// nomadCheckPrefix is the prefix that scopes Nomad registered checks for
	// services.
	nomadCheckPrefix = nomadServicePrefix + "-check-"

	// defaultRetryInterval is how quickly to retry syncing services and
	// checks to Consul when an error occurs. Will backoff up to a max.
	defaultRetryInterval = time.Second

	// defaultMaxRetryInterval is the default max retry interval.
	defaultMaxRetryInterval = 30 * time.Second

	// defaultPeriodicInterval is the interval at which the service
	// client reconciles state between the desired services and checks and
	// what's actually registered in Consul. This is done at an interval,
	// rather than being purely edge triggered, to handle the case that the
	// Consul agent's state may change underneath us.
	defaultPeriodicInterval = 30 * time.Second

	// ttlCheckBuffer is the time interval that Nomad can take to report the
	// check result to Consul.
	ttlCheckBuffer = 31 * time.Second

	// defaultShutdownWait is how long Shutdown() should block waiting for
	// enqueued operations to sync to Consul by default.
	defaultShutdownWait = time.Minute

	// DefaultQueryWaitDuration is the max duration the Consul Agent will
	// spend waiting for a response from a Consul Query.
	DefaultQueryWaitDuration = 2 * time.Second

	// ServiceTagHTTP is the tag assigned to HTTP services.
	ServiceTagHTTP = "http"

	// ServiceTagRPC is the tag assigned to RPC services.
	ServiceTagRPC = "rpc"

	// ServiceTagSerf is the tag assigned to Serf services.
	ServiceTagSerf = "serf"

	// deregisterProbationPeriod is the initialization period where
	// services registered in Consul but not in Nomad don't get deregistered,
	// to allow for nomad restoring tasks.
	deregisterProbationPeriod = time.Minute
)
|
|
|
|
|
2020-03-27 20:07:55 +00:00
|
|
|
// Additional Consul ACLs required
|
|
|
|
// - Consul Template: key:read
|
|
|
|
// Used in tasks with template stanza that use Consul keys.
|
|
|
|
|
2017-02-01 00:43:57 +00:00
|
|
|
// CatalogAPI is the consul/api.Catalog API used by Nomad.
//
// ACL requirements
// - node:read (listing datacenters)
// - service:read
type CatalogAPI interface {
	// Datacenters lists the known datacenter names.
	Datacenters() ([]string, error)

	// Service returns the catalog entries for the named service, given a tag
	// and query options.
	// NOTE(review): presumably mirrors consul/api Catalog.Service, where tag
	// acts as an optional filter — confirm against the consul/api docs.
	Service(service, tag string, q *api.QueryOptions) ([]*api.CatalogService, *api.QueryMeta, error)
}
|
|
|
|
|
2021-03-16 18:22:21 +00:00
|
|
|
// NamespaceAPI is the consul/api.Namespace API used by Nomad.
//
// ACL requirements
// - operator:read OR namespace:*:read
type NamespaceAPI interface {
	// List returns the Consul namespaces along with query metadata.
	List(q *api.QueryOptions) ([]*api.Namespace, *api.QueryMeta, error)
}
|
|
|
|
|
2017-02-01 00:43:57 +00:00
|
|
|
// AgentAPI is the consul/api.Agent API used by Nomad.
//
// ACL requirements
// - agent:read
// - service:write
type AgentAPI interface {
	// ServicesWithFilterOpts returns the agent's services selected by filter.
	// NOTE(review): map keys are presumably service IDs — confirm against
	// the consul/api docs.
	ServicesWithFilterOpts(filter string, q *api.QueryOptions) (map[string]*api.AgentService, error)

	// ChecksWithFilterOpts returns the agent's checks selected by filter.
	// NOTE(review): map keys are presumably check IDs — confirm against
	// the consul/api docs.
	ChecksWithFilterOpts(filter string, q *api.QueryOptions) (map[string]*api.AgentCheck, error)

	// CheckRegister registers the given check with the agent.
	CheckRegister(check *api.AgentCheckRegistration) error

	// CheckDeregisterOpts deregisters the check identified by checkID.
	CheckDeregisterOpts(checkID string, q *api.QueryOptions) error

	// Self returns information about the agent as a nested map.
	// NOTE(review): the exact key layout is defined by Consul — confirm
	// against the consul/api docs before relying on specific keys.
	Self() (map[string]map[string]interface{}, error)

	// ServiceRegister registers the given service with the agent.
	ServiceRegister(service *api.AgentServiceRegistration) error

	// ServiceDeregisterOpts deregisters the service identified by serviceID.
	ServiceDeregisterOpts(serviceID string, q *api.QueryOptions) error

	// UpdateTTLOpts reports the given output and status for the check
	// identified by id.
	UpdateTTLOpts(id, output, status string, q *api.QueryOptions) error
}
|
|
|
|
|
2020-07-28 20:12:08 +00:00
|
|
|
// ConfigAPI is the consul/api.ConfigEntries API subset used by Nomad Server.
//
// ACL requirements
// - operator:write (server only)
type ConfigAPI interface {
	// Set writes the given configuration entry.
	// NOTE(review): the bool result presumably reports whether the write was
	// applied — confirm against the consul/api ConfigEntries.Set docs.
	Set(entry api.ConfigEntry, w *api.WriteOptions) (bool, *api.WriteMeta, error)
	// Delete(kind, name string, w *api.WriteOptions) (*api.WriteMeta, error) (not used)
}
|
|
|
|
|
2019-12-19 23:40:30 +00:00
|
|
|
// ACLsAPI is the consul/api.ACL API subset used by Nomad Server.
//
// ACL requirements
// - acl:write (server only)
type ACLsAPI interface {
	// We are looking up by [operator token] SecretID, which implies we need
	// to use this method instead of the normal TokenRead, which can only be
	// used to lookup tokens by their AccessorID.
	TokenReadSelf(q *api.QueryOptions) (*api.ACLToken, *api.QueryMeta, error)

	// PolicyRead returns the ACL policy identified by policyID.
	PolicyRead(policyID string, q *api.QueryOptions) (*api.ACLPolicy, *api.QueryMeta, error)

	// RoleRead returns the ACL role identified by roleID.
	RoleRead(roleID string, q *api.QueryOptions) (*api.ACLRole, *api.QueryMeta, error)

	// TokenCreate creates an ACL token from the partially populated token
	// and returns the resulting token.
	TokenCreate(partial *api.ACLToken, q *api.WriteOptions) (*api.ACLToken, *api.WriteMeta, error)

	// TokenDelete deletes the ACL token identified by accessorID.
	TokenDelete(accessorID string, q *api.WriteOptions) (*api.WriteMeta, error)

	// TokenList returns a listing of ACL tokens.
	TokenList(q *api.QueryOptions) ([]*api.ACLTokenListEntry, *api.QueryMeta, error)
}
|
|
|
|
|
client: enable configuring enable_tag_override for services
Consul provides a feature of Service Definitions where the tags
associated with a service can be modified through the Catalog API,
overriding the value(s) configured in the agent's service configuration.
To enable this feature, the flag enable_tag_override must be configured
in the service definition.
Previously, Nomad did not allow configuring this flag, and thus the default
value of false was used. Now, it is configurable.
Because Nomad itself acts as a state machine around the service definitions
of the tasks it manages, it's worth describing what happens when this feature
is enabled and why.
Consider the basic case where there is no Nomad, and your service is provided
to consul as a boring JSON file. The ultimate source of truth for the definition
of that service is the file, and is stored in the agent. Later, Consul performs
"anti-entropy" which synchronizes the Catalog (stored only the leaders). Then
with enable_tag_override=true, the tags field is available for "external"
modification through the Catalog API (rather than directly configuring the
service definition file, or using the Agent API). The important observation
is that if the service definition ever changes (i.e. the file is changed &
config reloaded OR the Agent API is used to modify the service), those
"external" tag values are thrown away, and the new service definition is
once again the source of truth.
In the Nomad case, Nomad itself is the source of truth over the Agent in
the same way the JSON file was the source of truth in the example above.
That means any time Nomad sets a new service definition, any externally
configured tags are going to be replaced. When does this happen? Only on
major lifecycle events, for example when a task is modified because of an
updated job spec from the 'nomad job run <existing>' command. Otherwise,
Nomad's periodic re-sync's with Consul will now no longer try to restore
the externally modified tag values (as long as enable_tag_override=true).
Fixes #2057
2020-02-07 21:22:19 +00:00
|
|
|
// agentServiceUpdateRequired checks if any critical fields in Nomad's version
|
|
|
|
// of a service definition are different from the existing service definition as
|
|
|
|
// known by Consul.
|
2020-02-14 19:44:34 +00:00
|
|
|
//
|
|
|
|
// reason - The syncReason that triggered this synchronization with the consul
|
|
|
|
// agent API.
|
|
|
|
// wanted - Nomad's view of what the service definition is intended to be.
|
|
|
|
// Not nil.
|
|
|
|
// existing - Consul's view (agent, not catalog) of the actual service definition.
|
|
|
|
// Not nil.
|
|
|
|
// sidecar - Consul's view (agent, not catalog) of the service definition of the sidecar
|
|
|
|
// associated with existing that may or may not exist.
|
|
|
|
// May be nil.
|
|
|
|
func agentServiceUpdateRequired(reason syncReason, wanted *api.AgentServiceRegistration, existing *api.AgentService, sidecar *api.AgentService) bool {
|
client: enable configuring enable_tag_override for services
Consul provides a feature of Service Definitions where the tags
associated with a service can be modified through the Catalog API,
overriding the value(s) configured in the agent's service configuration.
To enable this feature, the flag enable_tag_override must be configured
in the service definition.
Previously, Nomad did not allow configuring this flag, and thus the default
value of false was used. Now, it is configurable.
Because Nomad itself acts as a state machine around the the service definitions
of the tasks it manages, it's worth describing what happens when this feature
is enabled and why.
Consider the basic case where there is no Nomad, and your service is provided
to consul as a boring JSON file. The ultimate source of truth for the definition
of that service is the file, and is stored in the agent. Later, Consul performs
"anti-entropy" which synchronizes the Catalog (stored only the leaders). Then
with enable_tag_override=true, the tags field is available for "external"
modification through the Catalog API (rather than directly configuring the
service definition file, or using the Agent API). The important observation
is that if the service definition ever changes (i.e. the file is changed &
config reloaded OR the Agent API is used to modify the service), those
"external" tag values are thrown away, and the new service definition is
once again the source of truth.
In the Nomad case, Nomad itself is the source of truth over the Agent in
the same way the JSON file was the source of truth in the example above.
That means any time Nomad sets a new service definition, any externally
configured tags are going to be replaced. When does this happen? Only on
major lifecycle events, for example when a task is modified because of an
updated job spec from the 'nomad job run <existing>' command. Otherwise,
Nomad's periodic re-sync's with Consul will now no longer try to restore
the externally modified tag values (as long as enable_tag_override=true).
Fixes #2057
2020-02-07 21:22:19 +00:00
|
|
|
switch reason {
|
|
|
|
case syncPeriodic:
|
|
|
|
// In a periodic sync with Consul, we need to respect the value of
|
|
|
|
// the enable_tag_override field so that we maintain the illusion that the
|
|
|
|
// user is in control of the Consul tags, as they may be externally edited
|
|
|
|
// via the Consul catalog API (e.g. a user manually sets them).
|
|
|
|
//
|
|
|
|
// As Consul does by disabling anti-entropy for the tags field, Nomad will
|
|
|
|
// ignore differences in the tags field during the periodic syncs with
|
|
|
|
// the Consul agent API.
|
|
|
|
//
|
|
|
|
// We do so by over-writing the nomad service registration by the value
|
|
|
|
// of the tags that Consul contains, if enable_tag_override = true.
|
2020-02-14 19:44:34 +00:00
|
|
|
maybeTweakTags(wanted, existing, sidecar)
|
|
|
|
return different(wanted, existing, sidecar)
|
client: enable configuring enable_tag_override for services
Consul provides a feature of Service Definitions where the tags
associated with a service can be modified through the Catalog API,
overriding the value(s) configured in the agent's service configuration.
To enable this feature, the flag enable_tag_override must be configured
in the service definition.
Previously, Nomad did not allow configuring this flag, and thus the default
value of false was used. Now, it is configurable.
Because Nomad itself acts as a state machine around the the service definitions
of the tasks it manages, it's worth describing what happens when this feature
is enabled and why.
Consider the basic case where there is no Nomad, and your service is provided
to consul as a boring JSON file. The ultimate source of truth for the definition
of that service is the file, and is stored in the agent. Later, Consul performs
"anti-entropy" which synchronizes the Catalog (stored only the leaders). Then
with enable_tag_override=true, the tags field is available for "external"
modification through the Catalog API (rather than directly configuring the
service definition file, or using the Agent API). The important observation
is that if the service definition ever changes (i.e. the file is changed &
config reloaded OR the Agent API is used to modify the service), those
"external" tag values are thrown away, and the new service definition is
once again the source of truth.
In the Nomad case, Nomad itself is the source of truth over the Agent in
the same way the JSON file was the source of truth in the example above.
That means any time Nomad sets a new service definition, any externally
configured tags are going to be replaced. When does this happen? Only on
major lifecycle events, for example when a task is modified because of an
updated job spec from the 'nomad job run <existing>' command. Otherwise,
Nomad's periodic re-sync's with Consul will now no longer try to restore
the externally modified tag values (as long as enable_tag_override=true).
Fixes #2057
2020-02-07 21:22:19 +00:00
|
|
|
|
|
|
|
default:
|
|
|
|
// A non-periodic sync with Consul indicates an operation has been set
|
|
|
|
// on the queue. This happens when service has been added / removed / modified
|
|
|
|
// and implies the Consul agent should be sync'd with nomad, because
|
|
|
|
// nomad is the ultimate source of truth for the service definition.
|
2020-02-14 19:44:34 +00:00
|
|
|
return different(wanted, existing, sidecar)
|
client: enable configuring enable_tag_override for services
Consul provides a feature of Service Definitions where the tags
associated with a service can be modified through the Catalog API,
overriding the value(s) configured in the agent's service configuration.
To enable this feature, the flag enable_tag_override must be configured
in the service definition.
Previously, Nomad did not allow configuring this flag, and thus the default
value of false was used. Now, it is configurable.
Because Nomad itself acts as a state machine around the the service definitions
of the tasks it manages, it's worth describing what happens when this feature
is enabled and why.
Consider the basic case where there is no Nomad, and your service is provided
to consul as a boring JSON file. The ultimate source of truth for the definition
of that service is the file, and is stored in the agent. Later, Consul performs
"anti-entropy" which synchronizes the Catalog (stored only the leaders). Then
with enable_tag_override=true, the tags field is available for "external"
modification through the Catalog API (rather than directly configuring the
service definition file, or using the Agent API). The important observation
is that if the service definition ever changes (i.e. the file is changed &
config reloaded OR the Agent API is used to modify the service), those
"external" tag values are thrown away, and the new service definition is
once again the source of truth.
In the Nomad case, Nomad itself is the source of truth over the Agent in
the same way the JSON file was the source of truth in the example above.
That means any time Nomad sets a new service definition, any externally
configured tags are going to be replaced. When does this happen? Only on
major lifecycle events, for example when a task is modified because of an
updated job spec from the 'nomad job run <existing>' command. Otherwise,
Nomad's periodic re-sync's with Consul will now no longer try to restore
the externally modified tag values (as long as enable_tag_override=true).
Fixes #2057
2020-02-07 21:22:19 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// maybeTweakTags will override wanted.Tags with a copy of existing.Tags only if
|
|
|
|
// EnableTagOverride is true. Otherwise the wanted service registration is left
|
|
|
|
// unchanged.
|
2020-02-14 19:44:34 +00:00
|
|
|
func maybeTweakTags(wanted *api.AgentServiceRegistration, existing *api.AgentService, sidecar *api.AgentService) {
|
client: enable configuring enable_tag_override for services
Consul provides a feature of Service Definitions where the tags
associated with a service can be modified through the Catalog API,
overriding the value(s) configured in the agent's service configuration.
To enable this feature, the flag enable_tag_override must be configured
in the service definition.
Previously, Nomad did not allow configuring this flag, and thus the default
value of false was used. Now, it is configurable.
Because Nomad itself acts as a state machine around the the service definitions
of the tasks it manages, it's worth describing what happens when this feature
is enabled and why.
Consider the basic case where there is no Nomad, and your service is provided
to consul as a boring JSON file. The ultimate source of truth for the definition
of that service is the file, and is stored in the agent. Later, Consul performs
"anti-entropy" which synchronizes the Catalog (stored only the leaders). Then
with enable_tag_override=true, the tags field is available for "external"
modification through the Catalog API (rather than directly configuring the
service definition file, or using the Agent API). The important observation
is that if the service definition ever changes (i.e. the file is changed &
config reloaded OR the Agent API is used to modify the service), those
"external" tag values are thrown away, and the new service definition is
once again the source of truth.
In the Nomad case, Nomad itself is the source of truth over the Agent in
the same way the JSON file was the source of truth in the example above.
That means any time Nomad sets a new service definition, any externally
configured tags are going to be replaced. When does this happen? Only on
major lifecycle events, for example when a task is modified because of an
updated job spec from the 'nomad job run <existing>' command. Otherwise,
Nomad's periodic re-sync's with Consul will now no longer try to restore
the externally modified tag values (as long as enable_tag_override=true).
Fixes #2057
2020-02-07 21:22:19 +00:00
|
|
|
if wanted.EnableTagOverride {
|
|
|
|
wanted.Tags = helper.CopySliceString(existing.Tags)
|
2020-02-14 19:44:34 +00:00
|
|
|
// If the service registration also defines a sidecar service, use the ETO
|
|
|
|
// setting for the parent service to also apply to the sidecar.
|
|
|
|
if wanted.Connect != nil && wanted.Connect.SidecarService != nil {
|
|
|
|
if sidecar != nil {
|
|
|
|
wanted.Connect.SidecarService.Tags = helper.CopySliceString(sidecar.Tags)
|
|
|
|
}
|
|
|
|
}
|
client: enable configuring enable_tag_override for services
Consul provides a feature of Service Definitions where the tags
associated with a service can be modified through the Catalog API,
overriding the value(s) configured in the agent's service configuration.
To enable this feature, the flag enable_tag_override must be configured
in the service definition.
Previously, Nomad did not allow configuring this flag, and thus the default
value of false was used. Now, it is configurable.
Because Nomad itself acts as a state machine around the the service definitions
of the tasks it manages, it's worth describing what happens when this feature
is enabled and why.
Consider the basic case where there is no Nomad, and your service is provided
to consul as a boring JSON file. The ultimate source of truth for the definition
of that service is the file, and is stored in the agent. Later, Consul performs
"anti-entropy" which synchronizes the Catalog (stored only the leaders). Then
with enable_tag_override=true, the tags field is available for "external"
modification through the Catalog API (rather than directly configuring the
service definition file, or using the Agent API). The important observation
is that if the service definition ever changes (i.e. the file is changed &
config reloaded OR the Agent API is used to modify the service), those
"external" tag values are thrown away, and the new service definition is
once again the source of truth.
In the Nomad case, Nomad itself is the source of truth over the Agent in
the same way the JSON file was the source of truth in the example above.
That means any time Nomad sets a new service definition, any externally
configured tags are going to be replaced. When does this happen? Only on
major lifecycle events, for example when a task is modified because of an
updated job spec from the 'nomad job run <existing>' command. Otherwise,
Nomad's periodic re-sync's with Consul will now no longer try to restore
the externally modified tag values (as long as enable_tag_override=true).
Fixes #2057
2020-02-07 21:22:19 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// different compares the wanted state of the service registration with the actual
|
|
|
|
// (cached) state of the service registration reported by Consul. If any of the
|
|
|
|
// critical fields are not deeply equal, they considered different.
|
2020-02-14 19:44:34 +00:00
|
|
|
func different(wanted *api.AgentServiceRegistration, existing *api.AgentService, sidecar *api.AgentService) bool {
|
2020-11-11 22:43:14 +00:00
|
|
|
switch {
|
|
|
|
case wanted.Kind != existing.Kind:
|
|
|
|
return true
|
|
|
|
case wanted.ID != existing.ID:
|
|
|
|
return true
|
|
|
|
case wanted.Port != existing.Port:
|
|
|
|
return true
|
|
|
|
case wanted.Address != existing.Address:
|
|
|
|
return true
|
|
|
|
case wanted.Name != existing.Service:
|
|
|
|
return true
|
|
|
|
case wanted.EnableTagOverride != existing.EnableTagOverride:
|
|
|
|
return true
|
|
|
|
case !reflect.DeepEqual(wanted.Meta, existing.Meta):
|
|
|
|
return true
|
2021-02-16 18:44:41 +00:00
|
|
|
case tagsDifferent(wanted.Tags, existing.Tags):
|
2020-11-11 22:43:14 +00:00
|
|
|
return true
|
|
|
|
case connectSidecarDifferent(wanted, sidecar):
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
// tagsDifferent reports whether the two tag lists differ, either in length or
// in the value found at any position. The comparison is positional, so the
// same tags in a different order count as different. A nil slice and an empty
// slice are considered the same.
func tagsDifferent(a, b []string) bool {
	if len(a) != len(b) {
		return true
	}

	for idx := 0; idx < len(a); idx++ {
		if a[idx] != b[idx] {
			return true
		}
	}

	return false
}
|
|
|
|
|
2021-02-16 18:44:41 +00:00
|
|
|
// sidecarTagsDifferent includes the special logic for comparing sidecar tags
|
|
|
|
// from Nomad vs. Consul perspective. Because Consul forces the sidecar tags
|
|
|
|
// to inherit the parent service tags if the sidecar tags are unset, we need to
|
|
|
|
// take that into consideration when Nomad's sidecar tags are unset by instead
|
|
|
|
// comparing them to the parent service tags.
|
|
|
|
func sidecarTagsDifferent(parent, wanted, sidecar []string) bool {
|
|
|
|
if len(wanted) == 0 {
|
|
|
|
return tagsDifferent(parent, sidecar)
|
|
|
|
}
|
|
|
|
return tagsDifferent(wanted, sidecar)
|
|
|
|
}
|
|
|
|
|
|
|
|
// connectSidecarDifferent returns true if Nomad expects there to be a sidecar
|
|
|
|
// hanging off the desired parent service definition on the Consul side, and does
|
|
|
|
// not match with what Consul has.
|
2020-02-14 19:44:34 +00:00
|
|
|
func connectSidecarDifferent(wanted *api.AgentServiceRegistration, sidecar *api.AgentService) bool {
|
|
|
|
if wanted.Connect != nil && wanted.Connect.SidecarService != nil {
|
|
|
|
if sidecar == nil {
|
|
|
|
// consul lost our sidecar (?)
|
|
|
|
return true
|
|
|
|
}
|
2021-02-16 18:44:41 +00:00
|
|
|
|
|
|
|
if sidecarTagsDifferent(wanted.Tags, wanted.Connect.SidecarService.Tags, sidecar.Tags) {
|
2020-02-14 19:44:34 +00:00
|
|
|
// tags on the nomad definition have been modified
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-02-16 18:44:41 +00:00
|
|
|
// Either Nomad does not expect there to be a sidecar_service, or there is
|
|
|
|
// no actionable difference from the Consul sidecar_service definition.
|
2020-02-14 19:44:34 +00:00
|
|
|
return false
|
consul: Use a stable identifier for services
The current implementation of Service Registration uses a hash of the
nomad-internal state of a service to register it with Consul, this means that
any update to the service invalidates this name and we then deregister, and
recreate the service in Consul.
While this behaviour slightly simplifies reasoning about service registration,
this becomes problematic when we add consul health checks to a service. When
the service is re-registered, so are the checks, which default to failing for
at least one check period.
This commit migrates us to using a stable identifier based on the
allocation, task, and service identifiers, and uses the difference
between the remote and local state to decide when to push updates.
It uses the existing hashing mechanic to decide when UpdateTask should
regenerate service registrations for providing to Sync, but this should
be removable as part of a future refactor.
It additionally introduces the _nomad-check- prefix for check
definitions, to allow for future allowing of consul features like
maintenance mode.
2019-04-10 08:39:24 +00:00
|
|
|
}
|
|
|
|
|
2017-04-08 00:10:26 +00:00
|
|
|
// operations are submitted to the main loop via commit() for synchronizing
// with Consul.
//
// NOTE(review): judging by field names and types, the reg* fields carry full
// registrations to add, while the dereg* fields carry only the IDs of entries
// to remove — confirm against the code that consumes this struct.
type operations struct {
	regServices   []*api.AgentServiceRegistration
	regChecks     []*api.AgentCheckRegistration
	deregServices []string
	deregChecks   []string
}
|
|
|
|
|
2017-08-07 22:54:05 +00:00
|
|
|
// AllocRegistration holds the status of services registered for a particular
// allocation, grouped by task.
type AllocRegistration struct {
	// Tasks maps the name of a task to its registered services and checks
	Tasks map[string]*ServiceRegistrations
}
|
|
|
|
|
2017-08-10 20:07:03 +00:00
|
|
|
func (a *AllocRegistration) copy() *AllocRegistration {
|
2017-08-07 22:54:05 +00:00
|
|
|
c := &AllocRegistration{
|
2019-11-18 18:04:01 +00:00
|
|
|
Tasks: make(map[string]*ServiceRegistrations, len(a.Tasks)),
|
2017-08-07 22:54:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
for k, v := range a.Tasks {
|
2017-08-10 20:07:03 +00:00
|
|
|
c.Tasks[k] = v.copy()
|
2017-08-07 22:54:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return c
|
|
|
|
}
|
|
|
|
|
|
|
|
// NumServices returns the number of registered services
|
|
|
|
func (a *AllocRegistration) NumServices() int {
|
|
|
|
if a == nil {
|
|
|
|
return 0
|
|
|
|
}
|
|
|
|
|
|
|
|
total := 0
|
|
|
|
for _, treg := range a.Tasks {
|
|
|
|
for _, sreg := range treg.Services {
|
|
|
|
if sreg.Service != nil {
|
|
|
|
total++
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return total
|
|
|
|
}
|
|
|
|
|
|
|
|
// NumChecks returns the number of registered checks
|
|
|
|
func (a *AllocRegistration) NumChecks() int {
|
|
|
|
if a == nil {
|
|
|
|
return 0
|
|
|
|
}
|
|
|
|
|
|
|
|
total := 0
|
|
|
|
for _, treg := range a.Tasks {
|
|
|
|
for _, sreg := range treg.Services {
|
|
|
|
total += len(sreg.Checks)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return total
|
|
|
|
}
|
|
|
|
|
2019-11-18 18:04:01 +00:00
|
|
|
// ServiceRegistrations holds the status of services registered for a particular
// task or task group.
type ServiceRegistrations struct {
	// Services holds one ServiceRegistration per map key.
	// NOTE(review): keys are presumably Consul service IDs — confirm against
	// the code that populates this map.
	Services map[string]*ServiceRegistration
}
|
|
|
|
|
2019-11-18 18:04:01 +00:00
|
|
|
func (t *ServiceRegistrations) copy() *ServiceRegistrations {
|
|
|
|
c := &ServiceRegistrations{
|
2017-08-07 22:54:05 +00:00
|
|
|
Services: make(map[string]*ServiceRegistration, len(t.Services)),
|
|
|
|
}
|
|
|
|
|
|
|
|
for k, v := range t.Services {
|
2017-08-10 20:07:03 +00:00
|
|
|
c.Services[k] = v.copy()
|
2017-08-07 22:54:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return c
|
|
|
|
}
|
|
|
|
|
|
|
|
// ServiceRegistration holds the status of a registered Consul Service and its
// Checks.
type ServiceRegistration struct {
	// serviceID and checkIDs are internal fields that track just the IDs of the
	// services/checks registered in Consul. They are used to materialize the
	// other fields when queried.
	serviceID string
	checkIDs  map[string]struct{}

	// CheckOnUpdate is a map of checkIDs and the associated OnUpdate value
	// from the ServiceCheck. It is used to determine how a reported check's
	// status should be evaluated.
	CheckOnUpdate map[string]string

	// Service is the AgentService registered in Consul.
	Service *api.AgentService

	// Checks is the status of the registered checks.
	Checks []*api.AgentCheck
}
|
|
|
|
|
2017-08-10 20:07:03 +00:00
|
|
|
func (s *ServiceRegistration) copy() *ServiceRegistration {
|
|
|
|
// Copy does not copy the external fields but only the internal fields. This
|
|
|
|
// is so that the caller of AllocRegistrations can not access the internal
|
|
|
|
// fields and that method uses these fields to populate the external fields.
|
2017-08-07 22:54:05 +00:00
|
|
|
return &ServiceRegistration{
|
2021-01-22 19:45:26 +00:00
|
|
|
serviceID: s.serviceID,
|
|
|
|
checkIDs: helper.CopyMapStringStruct(s.checkIDs),
|
|
|
|
CheckOnUpdate: helper.CopyMapStringString(s.CheckOnUpdate),
|
2017-08-07 22:54:05 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-02-01 00:43:57 +00:00
|
|
|
// ServiceClient handles task and agent service registration with Consul.
type ServiceClient struct {
	// agentAPI and namespacesClient are the Consul handles supplied to
	// NewServiceClient; both are also handed to the check watcher.
	agentAPI         AgentAPI
	namespacesClient *NamespacesClient

	// logger is the logger used by the sync loop. retryInterval is multiplied
	// by the number of consecutive sync failures to compute the retry backoff,
	// which is capped at maxRetryInterval. periodicInterval is the delay
	// before the next sync after a successful one.
	logger           log.Logger
	retryInterval    time.Duration
	maxRetryInterval time.Duration
	periodicInterval time.Duration

	// exitCh is closed when the main Run loop exits
	exitCh chan struct{}

	// shutdownCh is closed when the client should shutdown
	shutdownCh chan struct{}

	// shutdownWait is how long Shutdown() blocks waiting for the final
	// sync() to finish. Defaults to defaultShutdownWait
	shutdownWait time.Duration

	// opCh carries batches of operations from commit to the Run loop, which
	// merges them into services and checks before syncing.
	opCh chan *operations

	// services and checks hold the registrations that should exist in
	// Consul, keyed by registration ID. They are mutated by merge.
	services map[string]*api.AgentServiceRegistration
	checks   map[string]*api.AgentCheckRegistration

	// explicitlyDeregisteredServices and explicitlyDeregisteredChecks record
	// the IDs that merge removed because of a deregister operation. Both are
	// reset after every successful sync.
	explicitlyDeregisteredServices map[string]bool
	explicitlyDeregisteredChecks   map[string]bool

	// allocRegistrations stores the services and checks that are registered
	// with Consul by allocation ID.
	allocRegistrations     map[string]*AllocRegistration
	allocRegistrationsLock sync.RWMutex

	// Nomad agent services and checks that are recorded so they can be removed
	// on shutdown. Defers to consul namespace specified in client consul config.
	agentServices map[string]struct{}
	agentChecks   map[string]struct{}
	agentLock     sync.Mutex

	// seen is 1 if Consul has ever been seen; otherwise 0. Accessed with
	// atomics.
	seen int32

	// deregisterProbationExpiry is the time before which consul sync shouldn't deregister
	// unknown services.
	// Used to mitigate risk of deleting restored services upon client restart.
	deregisterProbationExpiry time.Time

	// checkWatcher restarts checks that are unhealthy.
	checkWatcher *checkWatcher

	// isClientAgent specifies whether this Consul client is being used
	// by a Nomad client.
	isClientAgent bool
}
|
|
|
|
|
|
|
|
// NewServiceClient creates a new Consul ServiceClient from an existing Consul API
|
2018-06-01 20:59:53 +00:00
|
|
|
// Client, logger and takes whether the client is being used by a Nomad Client agent.
|
|
|
|
// When being used by a Nomad client, this Consul client reconciles all services and
|
|
|
|
// checks created by Nomad on behalf of running tasks.
|
2021-03-16 18:22:21 +00:00
|
|
|
func NewServiceClient(agentAPI AgentAPI, namespacesClient *NamespacesClient, logger log.Logger, isNomadClient bool) *ServiceClient {
|
2018-09-13 17:43:40 +00:00
|
|
|
logger = logger.ResetNamed("consul.sync")
|
2017-02-01 00:43:57 +00:00
|
|
|
return &ServiceClient{
|
2021-03-16 18:22:21 +00:00
|
|
|
agentAPI: agentAPI,
|
|
|
|
namespacesClient: namespacesClient,
|
2019-06-14 14:57:46 +00:00
|
|
|
logger: logger,
|
|
|
|
retryInterval: defaultRetryInterval,
|
|
|
|
maxRetryInterval: defaultMaxRetryInterval,
|
|
|
|
periodicInterval: defaultPeriodicInterval,
|
|
|
|
exitCh: make(chan struct{}),
|
|
|
|
shutdownCh: make(chan struct{}),
|
|
|
|
shutdownWait: defaultShutdownWait,
|
|
|
|
opCh: make(chan *operations, 8),
|
|
|
|
services: make(map[string]*api.AgentServiceRegistration),
|
|
|
|
checks: make(map[string]*api.AgentCheckRegistration),
|
|
|
|
explicitlyDeregisteredServices: make(map[string]bool),
|
|
|
|
explicitlyDeregisteredChecks: make(map[string]bool),
|
|
|
|
allocRegistrations: make(map[string]*AllocRegistration),
|
|
|
|
agentServices: make(map[string]struct{}),
|
|
|
|
agentChecks: make(map[string]struct{}),
|
2021-03-16 18:22:21 +00:00
|
|
|
checkWatcher: newCheckWatcher(logger, agentAPI, namespacesClient),
|
2019-06-14 14:57:46 +00:00
|
|
|
isClientAgent: isNomadClient,
|
|
|
|
deregisterProbationExpiry: time.Now().Add(deregisterProbationPeriod),
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-07-25 19:13:05 +00:00
|
|
|
// seen is the value markSeen stores into ServiceClient.seen and the value
// hasSeen compares it against; it means Consul has been contacted at least
// once.
const seen = 1
|
|
|
|
|
2017-07-24 23:48:40 +00:00
|
|
|
// markSeen marks Consul as having been seen (meaning at least one operation
|
2017-07-24 19:12:02 +00:00
|
|
|
// has succeeded).
|
2017-07-24 23:48:40 +00:00
|
|
|
func (c *ServiceClient) markSeen() {
|
2017-08-04 17:14:16 +00:00
|
|
|
atomic.StoreInt32(&c.seen, seen)
|
2017-07-24 19:12:02 +00:00
|
|
|
}
|
|
|
|
|
2017-07-24 23:48:40 +00:00
|
|
|
// hasSeen returns true if any Consul operation has ever succeeded. Useful to
|
2017-07-24 19:12:02 +00:00
|
|
|
// squelch errors if Consul isn't running.
|
2017-07-24 23:48:40 +00:00
|
|
|
func (c *ServiceClient) hasSeen() bool {
|
2017-08-04 17:14:16 +00:00
|
|
|
return atomic.LoadInt32(&c.seen) == seen
|
2017-07-24 19:12:02 +00:00
|
|
|
}
|
|
|
|
|
client: enable configuring enable_tag_override for services
Consul provides a feature of Service Definitions where the tags
associated with a service can be modified through the Catalog API,
overriding the value(s) configured in the agent's service configuration.
To enable this feature, the flag enable_tag_override must be configured
in the service definition.
Previously, Nomad did not allow configuring this flag, and thus the default
value of false was used. Now, it is configurable.
Because Nomad itself acts as a state machine around the service definitions
of the tasks it manages, it's worth describing what happens when this feature
is enabled and why.
Consider the basic case where there is no Nomad, and your service is provided
to consul as a boring JSON file. The ultimate source of truth for the definition
of that service is the file, and is stored in the agent. Later, Consul performs
"anti-entropy" which synchronizes the Catalog (stored only on the leaders). Then
with enable_tag_override=true, the tags field is available for "external"
modification through the Catalog API (rather than directly configuring the
service definition file, or using the Agent API). The important observation
is that if the service definition ever changes (i.e. the file is changed &
config reloaded OR the Agent API is used to modify the service), those
"external" tag values are thrown away, and the new service definition is
once again the source of truth.
In the Nomad case, Nomad itself is the source of truth over the Agent in
the same way the JSON file was the source of truth in the example above.
That means any time Nomad sets a new service definition, any externally
configured tags are going to be replaced. When does this happen? Only on
major lifecycle events, for example when a task is modified because of an
updated job spec from the 'nomad job run <existing>' command. Otherwise,
Nomad's periodic re-sync's with Consul will now no longer try to restore
the externally modified tag values (as long as enable_tag_override=true).
Fixes #2057
2020-02-07 21:22:19 +00:00
|
|
|
// syncReason indicates why a sync operation with consul is about to happen.
//
// The trigger for a sync may have implications on the behavior of the sync itself.
// In particular if a service is defined with enable_tag_override=true, the sync
// should ignore changes to the service's Tags field.
type syncReason byte

// The constants are explicitly typed as syncReason so that the compiler
// rejects arbitrary integers where a sync reason is expected. Their numeric
// values (0, 1, 2) are unchanged.
const (
	// syncPeriodic marks a sync triggered by the retry/periodic timer firing.
	syncPeriodic syncReason = iota
	// syncShutdown marks the sync performed once shutdown has been requested.
	syncShutdown
	// syncNewOps marks a sync triggered by newly committed operations.
	syncNewOps
)
|
|
|
|
|
2017-02-01 00:43:57 +00:00
|
|
|
// Run the Consul main loop which retries operations against Consul. It should
|
|
|
|
// be called exactly once.
|
|
|
|
func (c *ServiceClient) Run() {
|
2017-04-08 00:10:26 +00:00
|
|
|
defer close(c.exitCh)
|
2017-08-26 05:40:18 +00:00
|
|
|
|
2018-03-15 00:37:54 +00:00
|
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
|
|
defer cancel()
|
|
|
|
|
|
|
|
// init will be closed when Consul has been contacted
|
|
|
|
init := make(chan struct{})
|
2021-03-16 18:22:21 +00:00
|
|
|
go checkConsulTLSSkipVerify(ctx, c.logger, c.agentAPI, init)
|
2018-03-15 00:37:54 +00:00
|
|
|
|
|
|
|
// Process operations while waiting for initial contact with Consul but
|
|
|
|
// do not sync until contact has been made.
|
|
|
|
INIT:
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-init:
|
|
|
|
c.markSeen()
|
|
|
|
break INIT
|
|
|
|
case <-c.shutdownCh:
|
|
|
|
return
|
|
|
|
case ops := <-c.opCh:
|
|
|
|
c.merge(ops)
|
|
|
|
}
|
|
|
|
}
|
2018-09-13 17:43:40 +00:00
|
|
|
c.logger.Trace("able to contact Consul")
|
2018-03-15 00:37:54 +00:00
|
|
|
|
|
|
|
// Block until contact with Consul has been established
|
|
|
|
// Start checkWatcher
|
2017-08-26 05:40:18 +00:00
|
|
|
go c.checkWatcher.Run(ctx)
|
|
|
|
|
2018-04-17 19:36:50 +00:00
|
|
|
// Always immediately sync to reconcile Nomad and Consul's state
|
2017-04-08 00:10:26 +00:00
|
|
|
retryTimer := time.NewTimer(0)
|
2018-03-15 00:37:54 +00:00
|
|
|
|
2017-04-08 00:10:26 +00:00
|
|
|
failures := 0
|
|
|
|
for {
|
client: enable configuring enable_tag_override for services
Consul provides a feature of Service Definitions where the tags
associated with a service can be modified through the Catalog API,
overriding the value(s) configured in the agent's service configuration.
To enable this feature, the flag enable_tag_override must be configured
in the service definition.
Previously, Nomad did not allow configuring this flag, and thus the default
value of false was used. Now, it is configurable.
Because Nomad itself acts as a state machine around the the service definitions
of the tasks it manages, it's worth describing what happens when this feature
is enabled and why.
Consider the basic case where there is no Nomad, and your service is provided
to consul as a boring JSON file. The ultimate source of truth for the definition
of that service is the file, and is stored in the agent. Later, Consul performs
"anti-entropy" which synchronizes the Catalog (stored only the leaders). Then
with enable_tag_override=true, the tags field is available for "external"
modification through the Catalog API (rather than directly configuring the
service definition file, or using the Agent API). The important observation
is that if the service definition ever changes (i.e. the file is changed &
config reloaded OR the Agent API is used to modify the service), those
"external" tag values are thrown away, and the new service definition is
once again the source of truth.
In the Nomad case, Nomad itself is the source of truth over the Agent in
the same way the JSON file was the source of truth in the example above.
That means any time Nomad sets a new service definition, any externally
configured tags are going to be replaced. When does this happen? Only on
major lifecycle events, for example when a task is modified because of an
updated job spec from the 'nomad job run <existing>' command. Otherwise,
Nomad's periodic re-sync's with Consul will now no longer try to restore
the externally modified tag values (as long as enable_tag_override=true).
Fixes #2057
2020-02-07 21:22:19 +00:00
|
|
|
// On every iteration take note of what the trigger for the next sync
|
|
|
|
// was, so that it may be referenced during the sync itself.
|
|
|
|
var reasonForSync syncReason
|
|
|
|
|
2017-04-08 00:10:26 +00:00
|
|
|
select {
|
|
|
|
case <-retryTimer.C:
|
client: enable configuring enable_tag_override for services
Consul provides a feature of Service Definitions where the tags
associated with a service can be modified through the Catalog API,
overriding the value(s) configured in the agent's service configuration.
To enable this feature, the flag enable_tag_override must be configured
in the service definition.
Previously, Nomad did not allow configuring this flag, and thus the default
value of false was used. Now, it is configurable.
Because Nomad itself acts as a state machine around the the service definitions
of the tasks it manages, it's worth describing what happens when this feature
is enabled and why.
Consider the basic case where there is no Nomad, and your service is provided
to consul as a boring JSON file. The ultimate source of truth for the definition
of that service is the file, and is stored in the agent. Later, Consul performs
"anti-entropy" which synchronizes the Catalog (stored only the leaders). Then
with enable_tag_override=true, the tags field is available for "external"
modification through the Catalog API (rather than directly configuring the
service definition file, or using the Agent API). The important observation
is that if the service definition ever changes (i.e. the file is changed &
config reloaded OR the Agent API is used to modify the service), those
"external" tag values are thrown away, and the new service definition is
once again the source of truth.
In the Nomad case, Nomad itself is the source of truth over the Agent in
the same way the JSON file was the source of truth in the example above.
That means any time Nomad sets a new service definition, any externally
configured tags are going to be replaced. When does this happen? Only on
major lifecycle events, for example when a task is modified because of an
updated job spec from the 'nomad job run <existing>' command. Otherwise,
Nomad's periodic re-sync's with Consul will now no longer try to restore
the externally modified tag values (as long as enable_tag_override=true).
Fixes #2057
2020-02-07 21:22:19 +00:00
|
|
|
reasonForSync = syncPeriodic
|
2017-04-08 00:10:26 +00:00
|
|
|
case <-c.shutdownCh:
|
client: enable configuring enable_tag_override for services
Consul provides a feature of Service Definitions where the tags
associated with a service can be modified through the Catalog API,
overriding the value(s) configured in the agent's service configuration.
To enable this feature, the flag enable_tag_override must be configured
in the service definition.
Previously, Nomad did not allow configuring this flag, and thus the default
value of false was used. Now, it is configurable.
Because Nomad itself acts as a state machine around the the service definitions
of the tasks it manages, it's worth describing what happens when this feature
is enabled and why.
Consider the basic case where there is no Nomad, and your service is provided
to consul as a boring JSON file. The ultimate source of truth for the definition
of that service is the file, and is stored in the agent. Later, Consul performs
"anti-entropy" which synchronizes the Catalog (stored only the leaders). Then
with enable_tag_override=true, the tags field is available for "external"
modification through the Catalog API (rather than directly configuring the
service definition file, or using the Agent API). The important observation
is that if the service definition ever changes (i.e. the file is changed &
config reloaded OR the Agent API is used to modify the service), those
"external" tag values are thrown away, and the new service definition is
once again the source of truth.
In the Nomad case, Nomad itself is the source of truth over the Agent in
the same way the JSON file was the source of truth in the example above.
That means any time Nomad sets a new service definition, any externally
configured tags are going to be replaced. When does this happen? Only on
major lifecycle events, for example when a task is modified because of an
updated job spec from the 'nomad job run <existing>' command. Otherwise,
Nomad's periodic re-sync's with Consul will now no longer try to restore
the externally modified tag values (as long as enable_tag_override=true).
Fixes #2057
2020-02-07 21:22:19 +00:00
|
|
|
reasonForSync = syncShutdown
|
2018-03-15 00:37:54 +00:00
|
|
|
// Cancel check watcher but sync one last time
|
|
|
|
cancel()
|
2017-04-08 00:10:26 +00:00
|
|
|
case ops := <-c.opCh:
|
client: enable configuring enable_tag_override for services
Consul provides a feature of Service Definitions where the tags
associated with a service can be modified through the Catalog API,
overriding the value(s) configured in the agent's service configuration.
To enable this feature, the flag enable_tag_override must be configured
in the service definition.
Previously, Nomad did not allow configuring this flag, and thus the default
value of false was used. Now, it is configurable.
Because Nomad itself acts as a state machine around the the service definitions
of the tasks it manages, it's worth describing what happens when this feature
is enabled and why.
Consider the basic case where there is no Nomad, and your service is provided
to consul as a boring JSON file. The ultimate source of truth for the definition
of that service is the file, and is stored in the agent. Later, Consul performs
"anti-entropy" which synchronizes the Catalog (stored only the leaders). Then
with enable_tag_override=true, the tags field is available for "external"
modification through the Catalog API (rather than directly configuring the
service definition file, or using the Agent API). The important observation
is that if the service definition ever changes (i.e. the file is changed &
config reloaded OR the Agent API is used to modify the service), those
"external" tag values are thrown away, and the new service definition is
once again the source of truth.
In the Nomad case, Nomad itself is the source of truth over the Agent in
the same way the JSON file was the source of truth in the example above.
That means any time Nomad sets a new service definition, any externally
configured tags are going to be replaced. When does this happen? Only on
major lifecycle events, for example when a task is modified because of an
updated job spec from the 'nomad job run <existing>' command. Otherwise,
Nomad's periodic re-sync's with Consul will now no longer try to restore
the externally modified tag values (as long as enable_tag_override=true).
Fixes #2057
2020-02-07 21:22:19 +00:00
|
|
|
reasonForSync = syncNewOps
|
2017-04-08 00:10:26 +00:00
|
|
|
c.merge(ops)
|
|
|
|
}
|
2017-02-01 00:43:57 +00:00
|
|
|
|
client: enable configuring enable_tag_override for services
Consul provides a feature of Service Definitions where the tags
associated with a service can be modified through the Catalog API,
overriding the value(s) configured in the agent's service configuration.
To enable this feature, the flag enable_tag_override must be configured
in the service definition.
Previously, Nomad did not allow configuring this flag, and thus the default
value of false was used. Now, it is configurable.
Because Nomad itself acts as a state machine around the the service definitions
of the tasks it manages, it's worth describing what happens when this feature
is enabled and why.
Consider the basic case where there is no Nomad, and your service is provided
to consul as a boring JSON file. The ultimate source of truth for the definition
of that service is the file, and is stored in the agent. Later, Consul performs
"anti-entropy" which synchronizes the Catalog (stored only the leaders). Then
with enable_tag_override=true, the tags field is available for "external"
modification through the Catalog API (rather than directly configuring the
service definition file, or using the Agent API). The important observation
is that if the service definition ever changes (i.e. the file is changed &
config reloaded OR the Agent API is used to modify the service), those
"external" tag values are thrown away, and the new service definition is
once again the source of truth.
In the Nomad case, Nomad itself is the source of truth over the Agent in
the same way the JSON file was the source of truth in the example above.
That means any time Nomad sets a new service definition, any externally
configured tags are going to be replaced. When does this happen? Only on
major lifecycle events, for example when a task is modified because of an
updated job spec from the 'nomad job run <existing>' command. Otherwise,
Nomad's periodic re-sync's with Consul will now no longer try to restore
the externally modified tag values (as long as enable_tag_override=true).
Fixes #2057
2020-02-07 21:22:19 +00:00
|
|
|
if err := c.sync(reasonForSync); err != nil {
|
2017-07-24 23:48:40 +00:00
|
|
|
if failures == 0 {
|
2017-12-08 01:08:25 +00:00
|
|
|
// Log on the first failure
|
2018-09-13 17:43:40 +00:00
|
|
|
c.logger.Warn("failed to update services in Consul", "error", err)
|
2017-12-08 01:08:25 +00:00
|
|
|
} else if failures%10 == 0 {
|
|
|
|
// Log every 10th consecutive failure
|
2018-09-13 17:43:40 +00:00
|
|
|
c.logger.Error("still unable to update services in Consul", "failures", failures, "error", err)
|
2017-04-08 00:10:26 +00:00
|
|
|
}
|
2017-12-08 01:08:25 +00:00
|
|
|
|
2017-07-24 22:37:53 +00:00
|
|
|
failures++
|
2017-04-08 00:10:26 +00:00
|
|
|
if !retryTimer.Stop() {
|
2017-04-18 23:36:20 +00:00
|
|
|
// Timer already expired, since the timer may
|
|
|
|
// or may not have been read in the select{}
|
|
|
|
// above, conditionally receive on it
|
2017-04-12 19:07:10 +00:00
|
|
|
select {
|
|
|
|
case <-retryTimer.C:
|
|
|
|
default:
|
|
|
|
}
|
2017-04-08 00:10:26 +00:00
|
|
|
}
|
|
|
|
backoff := c.retryInterval * time.Duration(failures)
|
|
|
|
if backoff > c.maxRetryInterval {
|
|
|
|
backoff = c.maxRetryInterval
|
|
|
|
}
|
|
|
|
retryTimer.Reset(backoff)
|
|
|
|
} else {
|
|
|
|
if failures > 0 {
|
2018-09-13 17:43:40 +00:00
|
|
|
c.logger.Info("successfully updated services in Consul")
|
2017-04-08 00:10:26 +00:00
|
|
|
failures = 0
|
|
|
|
}
|
2018-04-17 19:36:50 +00:00
|
|
|
|
2019-06-14 14:57:46 +00:00
|
|
|
// on successful sync, clear deregistered consul entities
|
|
|
|
c.clearExplicitlyDeregistered()
|
|
|
|
|
2018-04-17 19:36:50 +00:00
|
|
|
// Reset timer to periodic interval to periodically
|
|
|
|
// reconile with Consul
|
|
|
|
if !retryTimer.Stop() {
|
|
|
|
select {
|
|
|
|
case <-retryTimer.C:
|
|
|
|
default:
|
|
|
|
}
|
|
|
|
}
|
|
|
|
retryTimer.Reset(c.periodicInterval)
|
2017-04-08 00:10:26 +00:00
|
|
|
}
|
2017-02-01 00:43:57 +00:00
|
|
|
|
|
|
|
select {
|
2017-04-08 00:10:26 +00:00
|
|
|
case <-c.shutdownCh:
|
|
|
|
// Exit only after sync'ing all outstanding operations
|
|
|
|
if len(c.opCh) > 0 {
|
|
|
|
for len(c.opCh) > 0 {
|
|
|
|
c.merge(<-c.opCh)
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
2017-04-08 00:10:26 +00:00
|
|
|
continue
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
return
|
2017-04-08 00:10:26 +00:00
|
|
|
default:
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
2017-04-08 00:10:26 +00:00
|
|
|
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-04-18 00:07:42 +00:00
|
|
|
// commit operations unless already shutting down.
|
|
|
|
func (c *ServiceClient) commit(ops *operations) {
|
2017-02-01 00:43:57 +00:00
|
|
|
select {
|
2017-04-08 00:10:26 +00:00
|
|
|
case c.opCh <- ops:
|
|
|
|
case <-c.shutdownCh:
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-06-14 14:57:46 +00:00
|
|
|
func (c *ServiceClient) clearExplicitlyDeregistered() {
|
2021-03-16 18:22:21 +00:00
|
|
|
c.explicitlyDeregisteredServices = make(map[string]bool)
|
|
|
|
c.explicitlyDeregisteredChecks = make(map[string]bool)
|
2019-06-14 14:57:46 +00:00
|
|
|
}
|
|
|
|
|
2017-04-08 00:10:26 +00:00
|
|
|
// merge registrations into state map prior to sync'ing with Consul
|
|
|
|
func (c *ServiceClient) merge(ops *operations) {
|
|
|
|
for _, s := range ops.regServices {
|
|
|
|
c.services[s.ID] = s
|
|
|
|
}
|
|
|
|
for _, check := range ops.regChecks {
|
|
|
|
c.checks[check.ID] = check
|
|
|
|
}
|
|
|
|
for _, sid := range ops.deregServices {
|
|
|
|
delete(c.services, sid)
|
2019-06-14 14:57:46 +00:00
|
|
|
c.explicitlyDeregisteredServices[sid] = true
|
2017-04-08 00:10:26 +00:00
|
|
|
}
|
|
|
|
for _, cid := range ops.deregChecks {
|
|
|
|
delete(c.checks, cid)
|
2019-06-14 14:57:46 +00:00
|
|
|
c.explicitlyDeregisteredChecks[cid] = true
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
2017-04-18 23:23:39 +00:00
|
|
|
metrics.SetGauge([]string{"client", "consul", "services"}, float32(len(c.services)))
|
|
|
|
metrics.SetGauge([]string{"client", "consul", "checks"}, float32(len(c.checks)))
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// sync enqueued operations.
|
client: enable configuring enable_tag_override for services
Consul provides a feature of Service Definitions where the tags
associated with a service can be modified through the Catalog API,
overriding the value(s) configured in the agent's service configuration.
To enable this feature, the flag enable_tag_override must be configured
in the service definition.
Previously, Nomad did not allow configuring this flag, and thus the default
value of false was used. Now, it is configurable.
Because Nomad itself acts as a state machine around the service definitions
of the tasks it manages, it's worth describing what happens when this feature
is enabled and why.
Consider the basic case where there is no Nomad, and your service is provided
to consul as a boring JSON file. The ultimate source of truth for the definition
of that service is the file, and is stored in the agent. Later, Consul performs
"anti-entropy" which synchronizes the Catalog (stored only on the leaders). Then
with enable_tag_override=true, the tags field is available for "external"
modification through the Catalog API (rather than directly configuring the
service definition file, or using the Agent API). The important observation
is that if the service definition ever changes (i.e. the file is changed &
config reloaded OR the Agent API is used to modify the service), those
"external" tag values are thrown away, and the new service definition is
once again the source of truth.
In the Nomad case, Nomad itself is the source of truth over the Agent in
the same way the JSON file was the source of truth in the example above.
That means any time Nomad sets a new service definition, any externally
configured tags are going to be replaced. When does this happen? Only on
major lifecycle events, for example when a task is modified because of an
updated job spec from the 'nomad job run <existing>' command. Otherwise,
Nomad's periodic re-sync's with Consul will now no longer try to restore
the externally modified tag values (as long as enable_tag_override=true).
Fixes #2057
2020-02-07 21:22:19 +00:00
|
|
|
func (c *ServiceClient) sync(reason syncReason) error {
|
2017-04-08 00:10:26 +00:00
|
|
|
sreg, creg, sdereg, cdereg := 0, 0, 0, 0
|
2021-03-16 18:22:21 +00:00
|
|
|
var err error
|
2017-02-01 00:43:57 +00:00
|
|
|
|
2021-03-16 18:22:21 +00:00
|
|
|
// Get the list of all namespaces created so we can iterate them.
|
|
|
|
namespaces, err := c.namespacesClient.List()
|
2017-04-08 00:10:26 +00:00
|
|
|
if err != nil {
|
2017-04-18 23:23:39 +00:00
|
|
|
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
|
2021-03-16 18:22:21 +00:00
|
|
|
return errors.Wrap(err, "failed to query Consul namespaces")
|
|
|
|
}
|
|
|
|
|
|
|
|
// Accumulate all services in Consul across all namespaces.
|
|
|
|
servicesInConsul := make(map[string]*api.AgentService)
|
|
|
|
for _, namespace := range namespaces {
|
|
|
|
if nsServices, err := c.agentAPI.ServicesWithFilterOpts("", &api.QueryOptions{Namespace: normalizeNamespace(namespace)}); err != nil {
|
|
|
|
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
|
|
|
|
return errors.Wrap(err, "failed to query Consul services")
|
|
|
|
} else {
|
|
|
|
for k, v := range nsServices {
|
|
|
|
servicesInConsul[k] = v
|
|
|
|
}
|
|
|
|
}
|
2017-04-08 00:10:26 +00:00
|
|
|
}
|
2017-02-01 00:43:57 +00:00
|
|
|
|
2021-03-16 18:22:21 +00:00
|
|
|
// Compute whether we are still in probation period where we will avoid
|
|
|
|
// de-registering services.
|
2019-06-14 14:57:46 +00:00
|
|
|
inProbation := time.Now().Before(c.deregisterProbationExpiry)
|
|
|
|
|
2021-03-16 18:22:21 +00:00
|
|
|
// Remove Nomad services in Consul but unknown to Nomad.
|
|
|
|
for id := range servicesInConsul {
|
2017-04-08 00:10:26 +00:00
|
|
|
if _, ok := c.services[id]; ok {
|
|
|
|
// Known service, skip
|
|
|
|
continue
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
2018-06-01 20:59:53 +00:00
|
|
|
|
|
|
|
// Ignore if this is not a Nomad managed service. Also ignore
|
|
|
|
// Nomad managed services if this is not a client agent.
|
|
|
|
// This is to prevent server agents from removing services
|
|
|
|
// registered by client agents
|
|
|
|
if !isNomadService(id) || !c.isClientAgent {
|
2017-04-08 00:10:26 +00:00
|
|
|
// Not managed by Nomad, skip
|
|
|
|
continue
|
|
|
|
}
|
2017-12-08 01:08:25 +00:00
|
|
|
|
2019-06-14 14:57:46 +00:00
|
|
|
// Ignore unknown services during probation
|
|
|
|
if inProbation && !c.explicitlyDeregisteredServices[id] {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2019-08-30 18:05:30 +00:00
|
|
|
// Ignore if this is a service for a Nomad managed sidecar proxy.
|
|
|
|
if isNomadSidecar(id, c.services) {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2017-04-08 00:10:26 +00:00
|
|
|
// Unknown Nomad managed service; kill
|
2021-03-16 18:22:21 +00:00
|
|
|
ns := servicesInConsul[id].Namespace
|
|
|
|
if err := c.agentAPI.ServiceDeregisterOpts(id, &api.QueryOptions{Namespace: ns}); err != nil {
|
2017-12-08 01:08:25 +00:00
|
|
|
if isOldNomadService(id) {
|
|
|
|
// Don't hard-fail on old entries. See #3620
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2017-04-18 23:23:39 +00:00
|
|
|
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
|
2017-04-08 00:10:26 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
sdereg++
|
2017-12-01 14:24:14 +00:00
|
|
|
metrics.IncrCounter([]string{"client", "consul", "service_deregistrations"}, 1)
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
|
consul: Use a stable identifier for services
The current implementation of Service Registration uses a hash of the
nomad-internal state of a service to register it with Consul, this means that
any update to the service invalidates this name and we then deregister, and
recreate the service in Consul.
While this behaviour slightly simplifies reasoning about service registration,
this becomes problematic when we add consul health checks to a service. When
the service is re-registered, so are the checks, which default to failing for
at least one check period.
This commit migrates us to using a stable identifier based on the
allocation, task, and service identifiers, and uses the difference
between the remote and local state to decide when to push updates.
It uses the existing hashing mechanic to decide when UpdateTask should
regenerate service registrations for providing to Sync, but this should
be removable as part of a future refactor.
It additionally introduces the _nomad-check- prefix for check
definitions, to allow for future allowing of consul features like
maintenance mode.
2019-04-10 08:39:24 +00:00
|
|
|
// Add Nomad services missing from Consul, or where the service has been updated.
|
2020-02-14 19:44:34 +00:00
|
|
|
for id, serviceInNomad := range c.services {
|
|
|
|
|
2021-03-16 18:22:21 +00:00
|
|
|
serviceInConsul, exists := servicesInConsul[id]
|
|
|
|
sidecarInConsul := getNomadSidecar(id, servicesInConsul)
|
2020-02-14 19:44:34 +00:00
|
|
|
|
|
|
|
if !exists || agentServiceUpdateRequired(reason, serviceInNomad, serviceInConsul, sidecarInConsul) {
|
2021-03-16 18:22:21 +00:00
|
|
|
if err = c.agentAPI.ServiceRegister(serviceInNomad); err != nil {
|
2020-02-14 19:44:34 +00:00
|
|
|
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
|
|
|
|
return err
|
2017-04-18 04:15:13 +00:00
|
|
|
}
|
2020-02-14 19:44:34 +00:00
|
|
|
sreg++
|
|
|
|
metrics.IncrCounter([]string{"client", "consul", "service_registrations"}, 1)
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
consul: Use a stable identifier for services
The current implementation of Service Registration uses a hash of the
nomad-internal state of a service to register it with Consul, this means that
any update to the service invalidates this name and we then deregister, and
recreate the service in Consul.
While this behaviour slightly simplifies reasoning about service registration,
this becomes problematic when we add consul health checks to a service. When
the service is re-registered, so are the checks, which default to failing for
at least one check period.
This commit migrates us to using a stable identifier based on the
allocation, task, and service identifiers, and uses the difference
between the remote and local state to decide when to push updates.
It uses the existing hashing mechanic to decide when UpdateTask should
regenerate service registrations for providing to Sync, but this should
be removable as part of a future refactor.
It additionally introduces the _nomad-check- prefix for check
definitions, to allow for future allowing of consul features like
maintenance mode.
2019-04-10 08:39:24 +00:00
|
|
|
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
|
2021-03-16 18:22:21 +00:00
|
|
|
checksInConsul := make(map[string]*api.AgentCheck)
|
|
|
|
for _, namespace := range namespaces {
|
|
|
|
nsChecks, err := c.agentAPI.ChecksWithFilterOpts("", &api.QueryOptions{Namespace: normalizeNamespace(namespace)})
|
|
|
|
if err != nil {
|
|
|
|
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
|
|
|
|
return errors.Wrap(err, "failed to query Consul checks")
|
|
|
|
}
|
|
|
|
for k, v := range nsChecks {
|
|
|
|
checksInConsul[k] = v
|
|
|
|
}
|
2020-10-06 00:30:29 +00:00
|
|
|
}
|
|
|
|
|
2017-04-08 00:10:26 +00:00
|
|
|
// Remove Nomad checks in Consul but unknown locally
|
2021-03-16 18:22:21 +00:00
|
|
|
for id, check := range checksInConsul {
|
2017-04-08 00:10:26 +00:00
|
|
|
if _, ok := c.checks[id]; ok {
|
2017-04-18 04:15:13 +00:00
|
|
|
// Known check, leave it
|
2017-04-08 00:10:26 +00:00
|
|
|
continue
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
2018-06-01 20:59:53 +00:00
|
|
|
|
|
|
|
// Ignore if this is not a Nomad managed check. Also ignore
|
|
|
|
// Nomad managed checks if this is not a client agent.
|
|
|
|
// This is to prevent server agents from removing checks
|
|
|
|
// registered by client agents
|
2019-04-25 11:48:19 +00:00
|
|
|
if !isNomadService(check.ServiceID) || !c.isClientAgent || !isNomadCheck(check.CheckID) {
|
2017-06-16 23:35:16 +00:00
|
|
|
// Service not managed by Nomad, skip
|
2017-04-08 00:10:26 +00:00
|
|
|
continue
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
2017-12-08 01:08:25 +00:00
|
|
|
|
2019-06-14 14:57:46 +00:00
|
|
|
// Ignore unknown services during probation
|
|
|
|
if inProbation && !c.explicitlyDeregisteredChecks[id] {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2019-08-30 18:05:30 +00:00
|
|
|
// Ignore if this is a check for a Nomad managed sidecar proxy.
|
|
|
|
if isNomadSidecar(check.ServiceID, c.services) {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2017-12-08 01:08:25 +00:00
|
|
|
// Unknown Nomad managed check; remove
|
2021-03-16 18:22:21 +00:00
|
|
|
if err := c.agentAPI.CheckDeregisterOpts(id, &api.QueryOptions{Namespace: check.Namespace}); err != nil {
|
2017-12-08 01:08:25 +00:00
|
|
|
if isOldNomadService(check.ServiceID) {
|
|
|
|
// Don't hard-fail on old entries.
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2017-04-18 23:23:39 +00:00
|
|
|
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
|
2017-04-08 00:10:26 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
cdereg++
|
2017-12-01 14:24:14 +00:00
|
|
|
metrics.IncrCounter([]string{"client", "consul", "check_deregistrations"}, 1)
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
|
2017-04-08 00:10:26 +00:00
|
|
|
// Add Nomad checks missing from Consul
|
|
|
|
for id, check := range c.checks {
|
2021-03-16 18:22:21 +00:00
|
|
|
if _, ok := checksInConsul[id]; ok {
|
2017-12-08 01:08:25 +00:00
|
|
|
// Already in Consul; skipping
|
|
|
|
continue
|
2017-04-08 00:10:26 +00:00
|
|
|
}
|
2021-03-16 18:22:21 +00:00
|
|
|
if err := c.agentAPI.CheckRegister(check); err != nil {
|
2017-04-18 23:23:39 +00:00
|
|
|
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
|
2017-04-08 00:10:26 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
creg++
|
2017-12-01 14:24:14 +00:00
|
|
|
metrics.IncrCounter([]string{"client", "consul", "check_registrations"}, 1)
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
|
2019-02-02 20:18:30 +00:00
|
|
|
// Only log if something was actually synced
|
|
|
|
if sreg > 0 || sdereg > 0 || creg > 0 || cdereg > 0 {
|
|
|
|
c.logger.Debug("sync complete", "registered_services", sreg, "deregistered_services", sdereg,
|
|
|
|
"registered_checks", creg, "deregistered_checks", cdereg)
|
|
|
|
}
|
2017-02-01 00:43:57 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2017-04-13 23:59:27 +00:00
|
|
|
// RegisterAgent registers Nomad agents (client or server). The
|
|
|
|
// Service.PortLabel should be a literal port to be parsed with SplitHostPort.
|
|
|
|
// Script checks are not supported and will return an error. Registration is
|
|
|
|
// asynchronous.
|
2017-02-01 00:43:57 +00:00
|
|
|
//
|
|
|
|
// Agents will be deregistered when Shutdown is called.
|
2021-04-05 15:45:55 +00:00
|
|
|
//
|
|
|
|
// Note: no need to manually plumb Consul namespace into the agent service registration
|
|
|
|
// or its check registrations, because the Nomad Client's Consul Client will already
|
|
|
|
// have the Nomad Client's Consul Namespace set on startup.
|
2017-02-01 00:43:57 +00:00
|
|
|
func (c *ServiceClient) RegisterAgent(role string, services []*structs.Service) error {
|
2017-04-08 00:10:26 +00:00
|
|
|
ops := operations{}
|
2017-02-01 00:43:57 +00:00
|
|
|
|
2017-04-04 00:08:08 +00:00
|
|
|
for _, service := range services {
|
2017-02-01 00:43:57 +00:00
|
|
|
id := makeAgentServiceID(role, service)
|
2017-04-13 23:59:27 +00:00
|
|
|
|
|
|
|
// Unlike tasks, agents don't use port labels. Agent ports are
|
|
|
|
// stored directly in the PortLabel.
|
2017-02-01 00:43:57 +00:00
|
|
|
host, rawport, err := net.SplitHostPort(service.PortLabel)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("error parsing port label %q from service %q: %v", service.PortLabel, service.Name, err)
|
|
|
|
}
|
|
|
|
port, err := strconv.Atoi(rawport)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("error parsing port %q from service %q: %v", rawport, service.Name, err)
|
|
|
|
}
|
|
|
|
serviceReg := &api.AgentServiceRegistration{
|
|
|
|
ID: id,
|
|
|
|
Name: service.Name,
|
|
|
|
Tags: service.Tags,
|
|
|
|
Address: host,
|
|
|
|
Port: port,
|
2018-11-16 17:28:56 +00:00
|
|
|
// This enables the consul UI to show that Nomad registered this service
|
|
|
|
Meta: map[string]string{
|
|
|
|
"external-source": "nomad",
|
|
|
|
},
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
2017-04-08 00:10:26 +00:00
|
|
|
ops.regServices = append(ops.regServices, serviceReg)
|
2017-02-01 00:43:57 +00:00
|
|
|
|
|
|
|
for _, check := range service.Checks {
|
support script checks for task group services (#6197)
In Nomad prior to Consul Connect, all Consul checks work the same
except for Script checks. Because the Task being checked is running in
its own container namespaces, the check is executed by Nomad in the
Task's context. If the Script check passes, Nomad uses the TTL check
feature of Consul to update the check status. This means in order to
run a Script check, we need to know what Task to execute it in.
To support Consul Connect, we need Group Services, and these need to
be registered in Consul along with their checks. We could push the
Service down into the Task, but this doesn't work if someone wants to
associate a service with a task's ports, but do script checks in
another task in the allocation.
Because Nomad is handling the Script check and not Consul anyways,
this moves the script check handling into the task runner so that the
task runner can own the script check's configuration and
lifecycle. This will allow us to pass the group service check
configuration down into a task without associating the service itself
with the task.
When tasks are checked for script checks, we walk back through their
task group to see if there are script checks associated with the
task. If so, we'll spin off script check tasklets for them. The
group-level service and any restart behaviors it needs are entirely
encapsulated within the group service hook.
2019-09-03 19:09:04 +00:00
|
|
|
checkID := MakeCheckID(id, check)
|
2017-02-01 00:43:57 +00:00
|
|
|
if check.Type == structs.ServiceCheckScript {
|
|
|
|
return fmt.Errorf("service %q contains invalid check: agent checks do not support scripts", service.Name)
|
|
|
|
}
|
|
|
|
checkHost, checkPort := serviceReg.Address, serviceReg.Port
|
|
|
|
if check.PortLabel != "" {
|
2017-04-13 23:59:27 +00:00
|
|
|
// Unlike tasks, agents don't use port labels. Agent ports are
|
|
|
|
// stored directly in the PortLabel.
|
2017-02-01 00:43:57 +00:00
|
|
|
host, rawport, err := net.SplitHostPort(check.PortLabel)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("error parsing port label %q from check %q: %v", service.PortLabel, check.Name, err)
|
|
|
|
}
|
|
|
|
port, err := strconv.Atoi(rawport)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("error parsing port %q from check %q: %v", rawport, check.Name, err)
|
|
|
|
}
|
|
|
|
checkHost, checkPort = host, port
|
|
|
|
}
|
2021-04-05 15:45:55 +00:00
|
|
|
checkReg, err := createCheckReg(id, checkID, check, checkHost, checkPort, "")
|
2017-02-01 00:43:57 +00:00
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("failed to add check %q: %v", check.Name, err)
|
|
|
|
}
|
2017-04-08 00:10:26 +00:00
|
|
|
ops.regChecks = append(ops.regChecks, checkReg)
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-04-18 00:07:42 +00:00
|
|
|
// Don't bother committing agent checks if we're already shutting down
|
|
|
|
c.agentLock.Lock()
|
|
|
|
defer c.agentLock.Unlock()
|
|
|
|
select {
|
|
|
|
case <-c.shutdownCh:
|
2017-04-08 00:10:26 +00:00
|
|
|
return nil
|
2017-04-18 00:07:42 +00:00
|
|
|
default:
|
2017-04-08 00:10:26 +00:00
|
|
|
}
|
2017-02-01 00:43:57 +00:00
|
|
|
|
2017-04-18 00:07:42 +00:00
|
|
|
// Now add them to the registration queue
|
|
|
|
c.commit(&ops)
|
|
|
|
|
2017-02-01 00:43:57 +00:00
|
|
|
// Record IDs for deregistering on shutdown
|
2017-04-08 00:10:26 +00:00
|
|
|
for _, id := range ops.regServices {
|
2017-04-13 20:49:23 +00:00
|
|
|
c.agentServices[id.ID] = struct{}{}
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
2017-04-08 00:10:26 +00:00
|
|
|
for _, id := range ops.regChecks {
|
2017-04-13 20:49:23 +00:00
|
|
|
c.agentChecks[id.ID] = struct{}{}
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
2017-04-04 00:08:08 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// serviceRegs creates service registrations, check registrations, and script
|
2017-08-07 22:54:05 +00:00
|
|
|
// checks from a service. It returns a service registration object with the
|
|
|
|
// service and check IDs populated.
|
2019-11-18 18:04:01 +00:00
|
|
|
func (c *ServiceClient) serviceRegs(ops *operations, service *structs.Service, workload *WorkloadServices) (
|
2018-04-23 23:34:53 +00:00
|
|
|
*ServiceRegistration, error) {
|
2017-04-04 00:08:08 +00:00
|
|
|
|
2017-08-07 22:54:05 +00:00
|
|
|
// Get the services ID
|
2019-11-18 18:04:01 +00:00
|
|
|
id := MakeAllocServiceID(workload.AllocID, workload.Name(), service)
|
2017-08-07 22:54:05 +00:00
|
|
|
sreg := &ServiceRegistration{
|
2021-01-22 19:45:26 +00:00
|
|
|
serviceID: id,
|
|
|
|
checkIDs: make(map[string]struct{}, len(service.Checks)),
|
|
|
|
CheckOnUpdate: make(map[string]string, len(service.Checks)),
|
2017-08-07 22:54:05 +00:00
|
|
|
}
|
|
|
|
|
2017-12-05 19:39:42 +00:00
|
|
|
// Service address modes default to auto
|
2017-06-09 17:29:41 +00:00
|
|
|
addrMode := service.AddressMode
|
2017-12-05 19:39:42 +00:00
|
|
|
if addrMode == "" {
|
|
|
|
addrMode = structs.AddressModeAuto
|
2017-06-09 17:29:41 +00:00
|
|
|
}
|
2017-12-05 19:39:42 +00:00
|
|
|
|
|
|
|
// Determine the address to advertise based on the mode
|
2020-10-15 19:32:21 +00:00
|
|
|
ip, port, err := getAddress(addrMode, service.PortLabel, workload.Networks, workload.DriverNetwork, workload.Ports, workload.NetworkStatus)
|
2017-12-05 19:39:42 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("unable to get address for service %q: %v", service.Name, err)
|
2017-06-09 17:29:41 +00:00
|
|
|
}
|
2017-08-07 22:54:05 +00:00
|
|
|
|
2018-04-23 23:34:53 +00:00
|
|
|
// Determine whether to use tags or canary_tags
|
|
|
|
var tags []string
|
2019-11-18 18:04:01 +00:00
|
|
|
if workload.Canary && len(service.CanaryTags) > 0 {
|
2018-04-23 23:34:53 +00:00
|
|
|
tags = make([]string, len(service.CanaryTags))
|
|
|
|
copy(tags, service.CanaryTags)
|
|
|
|
} else {
|
|
|
|
tags = make([]string, len(service.Tags))
|
|
|
|
copy(tags, service.Tags)
|
|
|
|
}
|
|
|
|
|
2019-08-14 22:02:00 +00:00
|
|
|
// newConnect returns (nil, nil) if there's no Connect-enabled service.
|
2021-02-09 12:05:28 +00:00
|
|
|
connect, err := newConnect(id, service.Name, service.Connect, workload.Networks, workload.Ports)
|
2019-08-14 22:02:00 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("invalid Consul Connect configuration for service %q: %v", service.Name, err)
|
|
|
|
}
|
|
|
|
|
2020-07-28 20:12:08 +00:00
|
|
|
// newConnectGateway returns nil if there's no Connect gateway.
|
|
|
|
gateway := newConnectGateway(service.Name, service.Connect)
|
|
|
|
|
2020-01-27 17:55:52 +00:00
|
|
|
// Determine whether to use meta or canary_meta
|
2019-11-13 03:27:54 +00:00
|
|
|
var meta map[string]string
|
2020-01-27 17:55:52 +00:00
|
|
|
if workload.Canary && len(service.CanaryMeta) > 0 {
|
2019-11-13 03:27:54 +00:00
|
|
|
meta = make(map[string]string, len(service.CanaryMeta)+1)
|
|
|
|
for k, v := range service.CanaryMeta {
|
|
|
|
meta[k] = v
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
meta = make(map[string]string, len(service.Meta)+1)
|
|
|
|
for k, v := range service.Meta {
|
|
|
|
meta[k] = v
|
|
|
|
}
|
2019-08-23 16:49:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// This enables the consul UI to show that Nomad registered this service
|
|
|
|
meta["external-source"] = "nomad"
|
|
|
|
|
2020-12-15 20:38:33 +00:00
|
|
|
// Explicitly set the Consul service Kind in case this service represents
|
|
|
|
// one of the Connect gateway types.
|
2020-07-28 20:12:08 +00:00
|
|
|
kind := api.ServiceKindTypical
|
2020-12-15 20:38:33 +00:00
|
|
|
switch {
|
|
|
|
case service.Connect.IsIngress():
|
2020-07-28 20:12:08 +00:00
|
|
|
kind = api.ServiceKindIngressGateway
|
2020-12-15 20:38:33 +00:00
|
|
|
case service.Connect.IsTerminating():
|
|
|
|
kind = api.ServiceKindTerminatingGateway
|
|
|
|
// set the default port if bridge / default listener set
|
|
|
|
if defaultBind, exists := service.Connect.Gateway.Proxy.EnvoyGatewayBindAddresses["default"]; exists {
|
|
|
|
portLabel := fmt.Sprintf("%s-%s", structs.ConnectTerminatingPrefix, service.Name)
|
|
|
|
if dynPort, ok := workload.Ports.Get(portLabel); ok {
|
|
|
|
defaultBind.Port = dynPort.Value
|
|
|
|
}
|
|
|
|
}
|
2020-07-28 20:12:08 +00:00
|
|
|
}
|
|
|
|
|
2017-08-07 22:54:05 +00:00
|
|
|
// Build the Consul Service registration request
|
2017-04-04 00:08:08 +00:00
|
|
|
serviceReg := &api.AgentServiceRegistration{
|
2020-07-28 20:12:08 +00:00
|
|
|
Kind: kind,
|
client: enable configuring enable_tag_override for services
Consul provides a feature of Service Definitions where the tags
associated with a service can be modified through the Catalog API,
overriding the value(s) configured in the agent's service configuration.
To enable this feature, the flag enable_tag_override must be configured
in the service definition.
Previously, Nomad did not allow configuring this flag, and thus the default
value of false was used. Now, it is configurable.
Because Nomad itself acts as a state machine around the the service definitions
of the tasks it manages, it's worth describing what happens when this feature
is enabled and why.
Consider the basic case where there is no Nomad, and your service is provided
to consul as a boring JSON file. The ultimate source of truth for the definition
of that service is the file, and is stored in the agent. Later, Consul performs
"anti-entropy" which synchronizes the Catalog (stored only the leaders). Then
with enable_tag_override=true, the tags field is available for "external"
modification through the Catalog API (rather than directly configuring the
service definition file, or using the Agent API). The important observation
is that if the service definition ever changes (i.e. the file is changed &
config reloaded OR the Agent API is used to modify the service), those
"external" tag values are thrown away, and the new service definition is
once again the source of truth.
In the Nomad case, Nomad itself is the source of truth over the Agent in
the same way the JSON file was the source of truth in the example above.
That means any time Nomad sets a new service definition, any externally
configured tags are going to be replaced. When does this happen? Only on
major lifecycle events, for example when a task is modified because of an
updated job spec from the 'nomad job run <existing>' command. Otherwise,
Nomad's periodic re-sync's with Consul will now no longer try to restore
the externally modified tag values (as long as enable_tag_override=true).
Fixes #2057
2020-02-07 21:22:19 +00:00
|
|
|
ID: id,
|
|
|
|
Name: service.Name,
|
2021-03-16 18:22:21 +00:00
|
|
|
Namespace: workload.ConsulNamespace,
|
client: enable configuring enable_tag_override for services
Consul provides a feature of Service Definitions where the tags
associated with a service can be modified through the Catalog API,
overriding the value(s) configured in the agent's service configuration.
To enable this feature, the flag enable_tag_override must be configured
in the service definition.
Previously, Nomad did not allow configuring this flag, and thus the default
value of false was used. Now, it is configurable.
Because Nomad itself acts as a state machine around the the service definitions
of the tasks it manages, it's worth describing what happens when this feature
is enabled and why.
Consider the basic case where there is no Nomad, and your service is provided
to consul as a boring JSON file. The ultimate source of truth for the definition
of that service is the file, and is stored in the agent. Later, Consul performs
"anti-entropy" which synchronizes the Catalog (stored only the leaders). Then
with enable_tag_override=true, the tags field is available for "external"
modification through the Catalog API (rather than directly configuring the
service definition file, or using the Agent API). The important observation
is that if the service definition ever changes (i.e. the file is changed &
config reloaded OR the Agent API is used to modify the service), those
"external" tag values are thrown away, and the new service definition is
once again the source of truth.
In the Nomad case, Nomad itself is the source of truth over the Agent in
the same way the JSON file was the source of truth in the example above.
That means any time Nomad sets a new service definition, any externally
configured tags are going to be replaced. When does this happen? Only on
major lifecycle events, for example when a task is modified because of an
updated job spec from the 'nomad job run <existing>' command. Otherwise,
Nomad's periodic re-sync's with Consul will now no longer try to restore
the externally modified tag values (as long as enable_tag_override=true).
Fixes #2057
2020-02-07 21:22:19 +00:00
|
|
|
Tags: tags,
|
|
|
|
EnableTagOverride: service.EnableTagOverride,
|
|
|
|
Address: ip,
|
|
|
|
Port: port,
|
|
|
|
Meta: meta,
|
|
|
|
Connect: connect, // will be nil if no Connect stanza
|
2020-07-28 20:12:08 +00:00
|
|
|
Proxy: gateway, // will be nil if no Connect Gateway stanza
|
2017-04-04 00:08:08 +00:00
|
|
|
}
|
2017-04-08 00:10:26 +00:00
|
|
|
ops.regServices = append(ops.regServices, serviceReg)
|
2017-08-07 22:54:05 +00:00
|
|
|
|
|
|
|
// Build the check registrations
|
2021-01-22 19:45:26 +00:00
|
|
|
checkRegs, err := c.checkRegs(id, service, workload, sreg)
|
2017-08-07 22:54:05 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2021-01-06 19:11:28 +00:00
|
|
|
for _, registration := range checkRegs {
|
|
|
|
sreg.checkIDs[registration.ID] = struct{}{}
|
|
|
|
ops.regChecks = append(ops.regChecks, registration)
|
2017-08-07 22:54:05 +00:00
|
|
|
}
|
2021-01-06 19:11:28 +00:00
|
|
|
|
2017-08-07 22:54:05 +00:00
|
|
|
return sreg, nil
|
2017-06-16 23:35:16 +00:00
|
|
|
}
|
|
|
|
|
2021-01-06 19:11:28 +00:00
|
|
|
// checkRegs creates check registrations for the given service
|
|
|
|
func (c *ServiceClient) checkRegs(serviceID string, service *structs.Service,
|
2021-01-22 19:45:26 +00:00
|
|
|
workload *WorkloadServices, sreg *ServiceRegistration) ([]*api.AgentCheckRegistration, error) {
|
2017-08-07 22:54:05 +00:00
|
|
|
|
2021-01-06 19:11:28 +00:00
|
|
|
registrations := make([]*api.AgentCheckRegistration, 0, len(service.Checks))
|
2017-04-04 00:08:08 +00:00
|
|
|
for _, check := range service.Checks {
|
2021-01-06 19:11:28 +00:00
|
|
|
var ip string
|
|
|
|
var port int
|
2017-04-19 19:18:06 +00:00
|
|
|
|
2021-01-06 19:11:28 +00:00
|
|
|
if check.Type != structs.ServiceCheckScript {
|
|
|
|
portLabel := check.PortLabel
|
|
|
|
if portLabel == "" {
|
|
|
|
portLabel = service.PortLabel
|
|
|
|
}
|
2017-12-05 19:39:42 +00:00
|
|
|
|
2021-01-06 19:11:28 +00:00
|
|
|
addrMode := check.AddressMode
|
|
|
|
if addrMode == "" {
|
|
|
|
// pre-#3380 compat
|
|
|
|
addrMode = structs.AddressModeHost
|
|
|
|
}
|
2017-12-05 19:39:42 +00:00
|
|
|
|
2021-01-06 19:11:28 +00:00
|
|
|
var err error
|
|
|
|
ip, port, err = getAddress(addrMode, portLabel, workload.Networks, workload.DriverNetwork, workload.Ports, workload.NetworkStatus)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("error getting address for check %q: %v", check.Name, err)
|
|
|
|
}
|
2017-12-05 19:39:42 +00:00
|
|
|
}
|
|
|
|
|
2021-01-06 19:11:28 +00:00
|
|
|
checkID := MakeCheckID(serviceID, check)
|
2021-03-16 18:22:21 +00:00
|
|
|
registration, err := createCheckReg(serviceID, checkID, check, ip, port, workload.ConsulNamespace)
|
2017-04-04 00:08:08 +00:00
|
|
|
if err != nil {
|
2017-08-07 22:54:05 +00:00
|
|
|
return nil, fmt.Errorf("failed to add check %q: %v", check.Name, err)
|
2017-04-04 00:08:08 +00:00
|
|
|
}
|
2021-01-22 19:45:26 +00:00
|
|
|
sreg.CheckOnUpdate[checkID] = check.OnUpdate
|
2021-01-06 19:11:28 +00:00
|
|
|
registrations = append(registrations, registration)
|
2017-04-04 00:08:08 +00:00
|
|
|
}
|
2021-01-06 19:11:28 +00:00
|
|
|
|
|
|
|
return registrations, nil
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
|
2019-11-18 18:04:01 +00:00
|
|
|
// RegisterWorkload with Consul. Adds all service entries and checks to Consul.
|
2017-02-01 00:43:57 +00:00
|
|
|
//
|
2017-06-09 17:29:41 +00:00
|
|
|
// If the service IP is set it is used as the address in the service registration.
|
|
|
|
// Checks will always use the IP from the Task struct (host's IP).
|
|
|
|
//
|
2018-03-11 17:41:50 +00:00
|
|
|
// Actual communication with Consul is done asynchronously (see Run).
|
2019-11-18 18:04:01 +00:00
|
|
|
func (c *ServiceClient) RegisterWorkload(workload *WorkloadServices) error {
|
2017-08-07 22:54:05 +00:00
|
|
|
// Fast path
|
2019-11-18 18:04:01 +00:00
|
|
|
numServices := len(workload.Services)
|
2017-08-07 22:54:05 +00:00
|
|
|
if numServices == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2019-11-18 18:04:01 +00:00
|
|
|
t := new(ServiceRegistrations)
|
2017-08-07 22:54:05 +00:00
|
|
|
t.Services = make(map[string]*ServiceRegistration, numServices)
|
|
|
|
|
2017-04-08 00:10:26 +00:00
|
|
|
ops := &operations{}
|
2019-11-18 18:04:01 +00:00
|
|
|
for _, service := range workload.Services {
|
|
|
|
sreg, err := c.serviceRegs(ops, service, workload)
|
2017-08-07 22:54:05 +00:00
|
|
|
if err != nil {
|
2017-04-04 00:08:08 +00:00
|
|
|
return err
|
|
|
|
}
|
2017-08-07 22:54:05 +00:00
|
|
|
t.Services[sreg.serviceID] = sreg
|
2017-04-04 00:08:08 +00:00
|
|
|
}
|
2017-08-07 22:54:05 +00:00
|
|
|
|
2019-11-18 18:04:01 +00:00
|
|
|
// Add the workload to the allocation's registration
|
|
|
|
c.addRegistrations(workload.AllocID, workload.Name(), t)
|
2017-08-07 22:54:05 +00:00
|
|
|
|
2017-04-08 00:10:26 +00:00
|
|
|
c.commit(ops)
|
2017-08-26 05:40:18 +00:00
|
|
|
|
|
|
|
// Start watching checks. Done after service registrations are built
|
|
|
|
// since an error building them could leak watches.
|
2019-11-18 18:04:01 +00:00
|
|
|
for _, service := range workload.Services {
|
|
|
|
serviceID := MakeAllocServiceID(workload.AllocID, workload.Name(), service)
|
2017-08-26 05:40:18 +00:00
|
|
|
for _, check := range service.Checks {
|
2017-09-14 16:58:35 +00:00
|
|
|
if check.TriggersRestarts() {
|
support script checks for task group services (#6197)
In Nomad prior to Consul Connect, all Consul checks work the same
except for Script checks. Because the Task being checked is running in
its own container namespaces, the check is executed by Nomad in the
Task's context. If the Script check passes, Nomad uses the TTL check
feature of Consul to update the check status. This means in order to
run a Script check, we need to know what Task to execute it in.
To support Consul Connect, we need Group Services, and these need to
be registered in Consul along with their checks. We could push the
Service down into the Task, but this doesn't work if someone wants to
associate a service with a task's ports, but do script checks in
another task in the allocation.
Because Nomad is handling the Script check and not Consul anyways,
this moves the script check handling into the task runner so that the
task runner can own the script check's configuration and
lifecycle. This will allow us to pass the group service check
configuration down into a task without associating the service itself
with the task.
When tasks are checked for script checks, we walk back through their
task group to see if there are script checks associated with the
task. If so, we'll spin off script check tasklets for them. The
group-level service and any restart behaviors it needs are entirely
encapsulated within the group service hook.
2019-09-03 19:09:04 +00:00
|
|
|
checkID := MakeCheckID(serviceID, check)
|
2019-11-18 18:04:01 +00:00
|
|
|
c.checkWatcher.Watch(workload.AllocID, workload.Name(), checkID, check, workload.Restarter)
|
2017-08-26 05:40:18 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2017-04-04 00:08:08 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2019-11-18 18:04:01 +00:00
|
|
|
// UpdateWorkload in Consul. Does not alter the service if only checks have
|
2017-04-04 00:08:08 +00:00
|
|
|
// changed.
|
2017-06-16 23:35:16 +00:00
|
|
|
//
|
|
|
|
// DriverNetwork must not change between invocations for the same allocation.
|
2019-11-18 18:04:01 +00:00
|
|
|
func (c *ServiceClient) UpdateWorkload(old, newWorkload *WorkloadServices) error {
|
client: enable configuring enable_tag_override for services
Consul provides a feature of Service Definitions where the tags
associated with a service can be modified through the Catalog API,
overriding the value(s) configured in the agent's service configuration.
To enable this feature, the flag enable_tag_override must be configured
in the service definition.
Previously, Nomad did not allow configuring this flag, and thus the default
value of false was used. Now, it is configurable.
Because Nomad itself acts as a state machine around the the service definitions
of the tasks it manages, it's worth describing what happens when this feature
is enabled and why.
Consider the basic case where there is no Nomad, and your service is provided
to consul as a boring JSON file. The ultimate source of truth for the definition
of that service is the file, and is stored in the agent. Later, Consul performs
"anti-entropy" which synchronizes the Catalog (stored only the leaders). Then
with enable_tag_override=true, the tags field is available for "external"
modification through the Catalog API (rather than directly configuring the
service definition file, or using the Agent API). The important observation
is that if the service definition ever changes (i.e. the file is changed &
config reloaded OR the Agent API is used to modify the service), those
"external" tag values are thrown away, and the new service definition is
once again the source of truth.
In the Nomad case, Nomad itself is the source of truth over the Agent in
the same way the JSON file was the source of truth in the example above.
That means any time Nomad sets a new service definition, any externally
configured tags are going to be replaced. When does this happen? Only on
major lifecycle events, for example when a task is modified because of an
updated job spec from the 'nomad job run <existing>' command. Otherwise,
Nomad's periodic re-sync's with Consul will now no longer try to restore
the externally modified tag values (as long as enable_tag_override=true).
Fixes #2057
2020-02-07 21:22:19 +00:00
|
|
|
ops := new(operations)
|
2019-11-18 18:04:01 +00:00
|
|
|
regs := new(ServiceRegistrations)
|
|
|
|
regs.Services = make(map[string]*ServiceRegistration, len(newWorkload.Services))
|
2017-08-07 22:54:05 +00:00
|
|
|
|
2019-11-18 18:04:01 +00:00
|
|
|
newIDs := make(map[string]*structs.Service, len(newWorkload.Services))
|
|
|
|
for _, s := range newWorkload.Services {
|
|
|
|
newIDs[MakeAllocServiceID(newWorkload.AllocID, newWorkload.Name(), s)] = s
|
2017-04-04 00:08:08 +00:00
|
|
|
}
|
|
|
|
|
2021-01-06 19:11:28 +00:00
|
|
|
// Loop over existing Services to see if they have been removed
|
|
|
|
for _, existingSvc := range old.Services {
|
|
|
|
existingID := MakeAllocServiceID(old.AllocID, old.Name(), existingSvc)
|
2017-04-04 00:08:08 +00:00
|
|
|
newSvc, ok := newIDs[existingID]
|
consul: Use a stable identifier for services
The current implementation of Service Registration uses a hash of the
nomad-internal state of a service to register it with Consul, this means that
any update to the service invalidates this name and we then deregister, and
recreate the service in Consul.
While this behaviour slightly simplifies reasoning about service registration,
this becomes problematic when we add consul health checks to a service. When
the service is re-registered, so are the checks, which default to failing for
at least one check period.
This commit migrates us to using a stable identifier based on the
allocation, task, and service identifiers, and uses the difference
between the remote and local state to decide when to push updates.
It uses the existing hashing mechanic to decide when UpdateTask should
regenerate service registrations for providing to Sync, but this should
be removable as part of a future refactor.
It additionally introduces the _nomad-check- prefix for check
definitions, to allow for future allowing of consul features like
maintenance mode.
2019-04-10 08:39:24 +00:00
|
|
|
|
2017-04-04 00:08:08 +00:00
|
|
|
if !ok {
|
2017-08-07 21:13:05 +00:00
|
|
|
// Existing service entry removed
|
2017-04-08 00:10:26 +00:00
|
|
|
ops.deregServices = append(ops.deregServices, existingID)
|
2017-04-04 00:08:08 +00:00
|
|
|
for _, check := range existingSvc.Checks {
|
support script checks for task group services (#6197)
In Nomad prior to Consul Connect, all Consul checks work the same
except for Script checks. Because the Task being checked is running in
its own container namespaces, the check is executed by Nomad in the
Task's context. If the Script check passes, Nomad uses the TTL check
feature of Consul to update the check status. This means in order to
run a Script check, we need to know what Task to execute it in.
To support Consul Connect, we need Group Services, and these need to
be registered in Consul along with their checks. We could push the
Service down into the Task, but this doesn't work if someone wants to
associate a service with a task's ports, but do script checks in
another task in the allocation.
Because Nomad is handling the Script check and not Consul anyways,
this moves the script check handling into the task runner so that the
task runner can own the script check's configuration and
lifecycle. This will allow us to pass the group service check
configuration down into a task without associating the service itself
with the task.
When tasks are checked for script checks, we walk back through their
task group to see if there are script checks associated with the
task. If so, we'll spin off script check tasklets for them. The
group-level service and any restart behaviors it needs are entirely
encapsulated within the group service hook.
2019-09-03 19:09:04 +00:00
|
|
|
cid := MakeCheckID(existingID, check)
|
2017-08-26 05:40:18 +00:00
|
|
|
ops.deregChecks = append(ops.deregChecks, cid)
|
|
|
|
|
|
|
|
// Unwatch watched checks
|
2017-09-14 16:58:35 +00:00
|
|
|
if check.TriggersRestarts() {
|
2017-08-26 05:40:18 +00:00
|
|
|
c.checkWatcher.Unwatch(cid)
|
|
|
|
}
|
2017-04-04 00:08:08 +00:00
|
|
|
}
|
|
|
|
continue
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
|
2019-11-18 18:04:01 +00:00
|
|
|
oldHash := existingSvc.Hash(old.AllocID, old.Name(), old.Canary)
|
|
|
|
newHash := newSvc.Hash(newWorkload.AllocID, newWorkload.Name(), newWorkload.Canary)
|
consul: Use a stable identifier for services
The current implementation of Service Registration uses a hash of the
nomad-internal state of a service to register it with Consul, this means that
any update to the service invalidates this name and we then deregister, and
recreate the service in Consul.
While this behaviour slightly simplifies reasoning about service registration,
this becomes problematic when we add consul health checks to a service. When
the service is re-registered, so are the checks, which default to failing for
at least one check period.
This commit migrates us to using a stable identifier based on the
allocation, task, and service identifiers, and uses the difference
between the remote and local state to decide when to push updates.
It uses the existing hashing mechanic to decide when UpdateTask should
regenerate service registrations for providing to Sync, but this should
be removable as part of a future refactor.
It additionally introduces the _nomad-check- prefix for check
definitions, to allow for future allowing of consul features like
maintenance mode.
2019-04-10 08:39:24 +00:00
|
|
|
if oldHash == newHash {
|
|
|
|
// Service exists and hasn't changed, don't re-add it later
|
|
|
|
delete(newIDs, existingID)
|
|
|
|
}
|
2017-12-08 01:08:25 +00:00
|
|
|
|
2017-08-07 22:54:05 +00:00
|
|
|
// Service still exists so add it to the task's registration
|
|
|
|
sreg := &ServiceRegistration{
|
2021-01-22 19:45:26 +00:00
|
|
|
serviceID: existingID,
|
|
|
|
checkIDs: make(map[string]struct{}, len(newSvc.Checks)),
|
|
|
|
CheckOnUpdate: make(map[string]string, len(newSvc.Checks)),
|
2017-08-07 22:54:05 +00:00
|
|
|
}
|
2019-11-18 18:04:01 +00:00
|
|
|
regs.Services[existingID] = sreg
|
2017-08-07 22:54:05 +00:00
|
|
|
|
2017-12-08 01:08:25 +00:00
|
|
|
// See if any checks were updated
|
2017-08-26 05:40:18 +00:00
|
|
|
existingChecks := make(map[string]*structs.ServiceCheck, len(existingSvc.Checks))
|
2017-04-04 00:08:08 +00:00
|
|
|
for _, check := range existingSvc.Checks {
|
support script checks for task group services (#6197)
In Nomad prior to Consul Connect, all Consul checks work the same
except for Script checks. Because the Task being checked is running in
its own container namespaces, the check is executed by Nomad in the
Task's context. If the Script check passes, Nomad uses the TTL check
feature of Consul to update the check status. This means in order to
run a Script check, we need to know what Task to execute it in.
To support Consul Connect, we need Group Services, and these need to
be registered in Consul along with their checks. We could push the
Service down into the Task, but this doesn't work if someone wants to
associate a service with a task's ports, but do script checks in
another task in the allocation.
Because Nomad is handling the Script check and not Consul anyways,
this moves the script check handling into the task runner so that the
task runner can own the script check's configuration and
lifecycle. This will allow us to pass the group service check
configuration down into a task without associating the service itself
with the task.
When tasks are checked for script checks, we walk back through their
task group to see if there are script checks associated with the
task. If so, we'll spin off script check tasklets for them. The
group-level service and any restart behaviors it needs are entirely
encapsulated within the group service hook.
2019-09-03 19:09:04 +00:00
|
|
|
existingChecks[MakeCheckID(existingID, check)] = check
|
2017-04-04 00:08:08 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Register new checks
|
|
|
|
for _, check := range newSvc.Checks {
|
support script checks for task group services (#6197)
In Nomad prior to Consul Connect, all Consul checks work the same
except for Script checks. Because the Task being checked is running in
its own container namespaces, the check is executed by Nomad in the
Task's context. If the Script check passes, Nomad uses the TTL check
feature of Consul to update the check status. This means in order to
run a Script check, we need to know what Task to execute it in.
To support Consul Connect, we need Group Services, and these need to
be registered in Consul along with their checks. We could push the
Service down into the Task, but this doesn't work if someone wants to
associate a service with a task's ports, but do script checks in
another task in the allocation.
Because Nomad is handling the Script check and not Consul anyways,
this moves the script check handling into the task runner so that the
task runner can own the script check's configuration and
lifecycle. This will allow us to pass the group service check
configuration down into a task without associating the service itself
with the task.
When tasks are checked for script checks, we walk back through their
task group to see if there are script checks associated with the
task. If so, we'll spin off script check tasklets for them. The
group-level service and any restart behaviors it needs are entirely
encapsulated within the group service hook.
2019-09-03 19:09:04 +00:00
|
|
|
checkID := MakeCheckID(existingID, check)
|
2017-04-08 00:10:26 +00:00
|
|
|
if _, exists := existingChecks[checkID]; exists {
|
consul: Use a stable identifier for services
The current implementation of Service Registration uses a hash of the
nomad-internal state of a service to register it with Consul, this means that
any update to the service invalidates this name and we then deregister, and
recreate the service in Consul.
While this behaviour slightly simplifies reasoning about service registration,
this becomes problematic when we add consul health checks to a service. When
the service is re-registered, so are the checks, which default to failing for
at least one check period.
This commit migrates us to using a stable identifier based on the
allocation, task, and service identifiers, and uses the difference
between the remote and local state to decide when to push updates.
It uses the existing hashing mechanic to decide when UpdateTask should
regenerate service registrations for providing to Sync, but this should
be removable as part of a future refactor.
It additionally introduces the _nomad-check- prefix for check
definitions, to allow for future allowing of consul features like
maintenance mode.
2019-04-10 08:39:24 +00:00
|
|
|
// Check is still required. Remove it from the map so it doesn't get
|
|
|
|
// deleted later.
|
2017-04-08 00:10:26 +00:00
|
|
|
delete(existingChecks, checkID)
|
2017-08-07 22:54:05 +00:00
|
|
|
sreg.checkIDs[checkID] = struct{}{}
|
2021-01-22 19:45:26 +00:00
|
|
|
sreg.CheckOnUpdate[checkID] = check.OnUpdate
|
2017-12-08 01:08:25 +00:00
|
|
|
}
|
2017-08-26 05:40:18 +00:00
|
|
|
|
2017-12-08 01:08:25 +00:00
|
|
|
// New check on an unchanged service; add them now
|
2021-01-22 19:45:26 +00:00
|
|
|
checkRegs, err := c.checkRegs(existingID, newSvc, newWorkload, sreg)
|
2017-12-08 01:08:25 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2017-08-26 05:40:18 +00:00
|
|
|
|
2021-01-06 19:11:28 +00:00
|
|
|
for _, registration := range checkRegs {
|
|
|
|
sreg.checkIDs[registration.ID] = struct{}{}
|
2021-01-22 19:45:26 +00:00
|
|
|
sreg.CheckOnUpdate[registration.ID] = check.OnUpdate
|
2021-01-06 19:11:28 +00:00
|
|
|
ops.regChecks = append(ops.regChecks, registration)
|
2017-08-26 05:40:18 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Update all watched checks as CheckRestart fields aren't part of ID
|
2017-09-14 16:58:35 +00:00
|
|
|
if check.TriggersRestarts() {
|
2019-11-18 18:04:01 +00:00
|
|
|
c.checkWatcher.Watch(newWorkload.AllocID, newWorkload.Name(), checkID, check, newWorkload.Restarter)
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-04-08 00:10:26 +00:00
|
|
|
// Remove existing checks not in updated service
|
2017-08-26 05:40:18 +00:00
|
|
|
for cid, check := range existingChecks {
|
2017-04-08 00:10:26 +00:00
|
|
|
ops.deregChecks = append(ops.deregChecks, cid)
|
2017-08-26 05:40:18 +00:00
|
|
|
|
|
|
|
// Unwatch checks
|
2017-09-14 16:58:35 +00:00
|
|
|
if check.TriggersRestarts() {
|
2017-08-26 05:40:18 +00:00
|
|
|
c.checkWatcher.Unwatch(cid)
|
|
|
|
}
|
2017-04-08 00:10:26 +00:00
|
|
|
}
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
|
2017-04-04 00:08:08 +00:00
|
|
|
// Any remaining services should just be enqueued directly
|
|
|
|
for _, newSvc := range newIDs {
|
2019-11-18 18:04:01 +00:00
|
|
|
sreg, err := c.serviceRegs(ops, newSvc, newWorkload)
|
2017-04-04 00:08:08 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2017-08-07 22:54:05 +00:00
|
|
|
|
2019-11-18 18:04:01 +00:00
|
|
|
regs.Services[sreg.serviceID] = sreg
|
2017-04-04 00:08:08 +00:00
|
|
|
}
|
|
|
|
|
2017-08-07 22:54:05 +00:00
|
|
|
// Add the task to the allocation's registration
|
2019-11-18 18:04:01 +00:00
|
|
|
c.addRegistrations(newWorkload.AllocID, newWorkload.Name(), regs)
|
2017-08-07 22:54:05 +00:00
|
|
|
|
2017-04-08 00:10:26 +00:00
|
|
|
c.commit(ops)
|
2017-08-26 05:40:18 +00:00
|
|
|
|
|
|
|
// Start watching checks. Done after service registrations are built
|
|
|
|
// since an error building them could leak watches.
|
2021-01-06 19:11:28 +00:00
|
|
|
for serviceID, service := range newIDs {
|
2017-08-26 05:40:18 +00:00
|
|
|
for _, check := range service.Checks {
|
2017-09-14 16:58:35 +00:00
|
|
|
if check.TriggersRestarts() {
|
support script checks for task group services (#6197)
In Nomad prior to Consul Connect, all Consul checks work the same
except for Script checks. Because the Task being checked is running in
its own container namespaces, the check is executed by Nomad in the
Task's context. If the Script check passes, Nomad uses the TTL check
feature of Consul to update the check status. This means in order to
run a Script check, we need to know what Task to execute it in.
To support Consul Connect, we need Group Services, and these need to
be registered in Consul along with their checks. We could push the
Service down into the Task, but this doesn't work if someone wants to
associate a service with a task's ports, but do script checks in
another task in the allocation.
Because Nomad is handling the Script check and not Consul anyways,
this moves the script check handling into the task runner so that the
task runner can own the script check's configuration and
lifecycle. This will allow us to pass the group service check
configuration down into a task without associating the service itself
with the task.
When tasks are checked for script checks, we walk back through their
task group to see if there are script checks associated with the
task. If so, we'll spin off script check tasklets for them. The
group-level service and any restart behaviors it needs are entirely
encapsulated within the group service hook.
2019-09-03 19:09:04 +00:00
|
|
|
checkID := MakeCheckID(serviceID, check)
|
2019-11-18 18:04:01 +00:00
|
|
|
c.checkWatcher.Watch(newWorkload.AllocID, newWorkload.Name(), checkID, check, newWorkload.Restarter)
|
2017-08-26 05:40:18 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
client: enable configuring enable_tag_override for services
Consul provides a feature of Service Definitions where the tags
associated with a service can be modified through the Catalog API,
overriding the value(s) configured in the agent's service configuration.
To enable this feature, the flag enable_tag_override must be configured
in the service definition.
Previously, Nomad did not allow configuring this flag, and thus the default
value of false was used. Now, it is configurable.
Because Nomad itself acts as a state machine around the the service definitions
of the tasks it manages, it's worth describing what happens when this feature
is enabled and why.
Consider the basic case where there is no Nomad, and your service is provided
to consul as a boring JSON file. The ultimate source of truth for the definition
of that service is the file, and is stored in the agent. Later, Consul performs
"anti-entropy" which synchronizes the Catalog (stored only the leaders). Then
with enable_tag_override=true, the tags field is available for "external"
modification through the Catalog API (rather than directly configuring the
service definition file, or using the Agent API). The important observation
is that if the service definition ever changes (i.e. the file is changed &
config reloaded OR the Agent API is used to modify the service), those
"external" tag values are thrown away, and the new service definition is
once again the source of truth.
In the Nomad case, Nomad itself is the source of truth over the Agent in
the same way the JSON file was the source of truth in the example above.
That means any time Nomad sets a new service definition, any externally
configured tags are going to be replaced. When does this happen? Only on
major lifecycle events, for example when a task is modified because of an
updated job spec from the 'nomad job run <existing>' command. Otherwise,
Nomad's periodic re-sync's with Consul will now no longer try to restore
the externally modified tag values (as long as enable_tag_override=true).
Fixes #2057
2020-02-07 21:22:19 +00:00
|
|
|
|
2017-02-01 00:43:57 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2019-11-18 18:04:01 +00:00
|
|
|
// RemoveWorkload from Consul. Removes all service entries and checks.
|
2017-02-01 00:43:57 +00:00
|
|
|
//
|
2018-03-11 17:41:50 +00:00
|
|
|
// Actual communication with Consul is done asynchronously (see Run).
|
2019-11-18 18:04:01 +00:00
|
|
|
func (c *ServiceClient) RemoveWorkload(workload *WorkloadServices) {
|
2017-04-08 00:10:26 +00:00
|
|
|
ops := operations{}
|
2017-02-01 00:43:57 +00:00
|
|
|
|
2019-11-18 18:04:01 +00:00
|
|
|
for _, service := range workload.Services {
|
|
|
|
id := MakeAllocServiceID(workload.AllocID, workload.Name(), service)
|
2017-04-08 00:10:26 +00:00
|
|
|
ops.deregServices = append(ops.deregServices, id)
|
2017-02-01 00:43:57 +00:00
|
|
|
|
|
|
|
for _, check := range service.Checks {
|
support script checks for task group services (#6197)
In Nomad prior to Consul Connect, all Consul checks work the same
except for Script checks. Because the Task being checked is running in
its own container namespaces, the check is executed by Nomad in the
Task's context. If the Script check passes, Nomad uses the TTL check
feature of Consul to update the check status. This means in order to
run a Script check, we need to know what Task to execute it in.
To support Consul Connect, we need Group Services, and these need to
be registered in Consul along with their checks. We could push the
Service down into the Task, but this doesn't work if someone wants to
associate a service with a task's ports, but do script checks in
another task in the allocation.
Because Nomad is handling the Script check and not Consul anyways,
this moves the script check handling into the task runner so that the
task runner can own the script check's configuration and
lifecycle. This will allow us to pass the group service check
configuration down into a task without associating the service itself
with the task.
When tasks are checked for script checks, we walk back through their
task group to see if there are script checks associated with the
task. If so, we'll spin off script check tasklets for them. The
group-level service and any restart behaviors it needs are entirely
encapsulated within the group service hook.
2019-09-03 19:09:04 +00:00
|
|
|
cid := MakeCheckID(id, check)
|
2017-08-26 05:40:18 +00:00
|
|
|
ops.deregChecks = append(ops.deregChecks, cid)
|
|
|
|
|
2017-09-14 16:58:35 +00:00
|
|
|
if check.TriggersRestarts() {
|
2017-08-26 05:40:18 +00:00
|
|
|
c.checkWatcher.Unwatch(cid)
|
|
|
|
}
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-11-18 18:04:01 +00:00
|
|
|
// Remove the workload from the alloc's registrations
|
|
|
|
c.removeRegistration(workload.AllocID, workload.Name())
|
2017-08-07 22:54:05 +00:00
|
|
|
|
2017-02-01 00:43:57 +00:00
|
|
|
// Now add them to the deregistration fields; main Run loop will update
|
2017-04-08 00:10:26 +00:00
|
|
|
c.commit(&ops)
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
|
2021-03-16 18:22:21 +00:00
|
|
|
// normalizeNamespace maps the "default" namespace to the empty string and
// returns every other value unchanged, so that Consul OSS will not produce an
// error when something is set in the default namespace.
func normalizeNamespace(namespace string) string {
	switch namespace {
	case "default":
		return ""
	default:
		return namespace
	}
}
|
|
|
|
|
2017-08-07 22:54:05 +00:00
|
|
|
// AllocRegistrations returns the registrations for the given allocation. If the
|
2021-03-16 18:22:21 +00:00
|
|
|
// allocation has no registrations, the response is a nil object.
|
2017-08-07 22:54:05 +00:00
|
|
|
func (c *ServiceClient) AllocRegistrations(allocID string) (*AllocRegistration, error) {
|
|
|
|
// Get the internal struct using the lock
|
|
|
|
c.allocRegistrationsLock.RLock()
|
|
|
|
regInternal, ok := c.allocRegistrations[allocID]
|
|
|
|
if !ok {
|
|
|
|
c.allocRegistrationsLock.RUnlock()
|
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Copy so we don't expose internal structs
|
2017-08-10 20:07:03 +00:00
|
|
|
reg := regInternal.copy()
|
2017-08-07 22:54:05 +00:00
|
|
|
c.allocRegistrationsLock.RUnlock()
|
|
|
|
|
2021-03-16 18:22:21 +00:00
|
|
|
// Get the list of all namespaces created so we can iterate them.
|
|
|
|
namespaces, err := c.namespacesClient.List()
|
2017-08-07 22:54:05 +00:00
|
|
|
if err != nil {
|
2021-03-16 18:22:21 +00:00
|
|
|
return nil, errors.Wrap(err, "failed to retrieve namespaces from consul")
|
2017-07-04 19:24:27 +00:00
|
|
|
}
|
|
|
|
|
2021-03-16 18:22:21 +00:00
|
|
|
services := make(map[string]*api.AgentService)
|
|
|
|
checks := make(map[string]*api.AgentCheck)
|
|
|
|
|
|
|
|
// Query the services and checks to populate the allocation registrations.
|
|
|
|
for _, namespace := range namespaces {
|
|
|
|
nsServices, err := c.agentAPI.ServicesWithFilterOpts("", &api.QueryOptions{Namespace: normalizeNamespace(namespace)})
|
|
|
|
if err != nil {
|
|
|
|
return nil, errors.Wrap(err, "failed to retrieve services from consul")
|
|
|
|
}
|
|
|
|
for k, v := range nsServices {
|
|
|
|
services[k] = v
|
|
|
|
}
|
|
|
|
|
|
|
|
nsChecks, err := c.agentAPI.ChecksWithFilterOpts("", &api.QueryOptions{Namespace: normalizeNamespace(namespace)})
|
|
|
|
if err != nil {
|
|
|
|
return nil, errors.Wrap(err, "failed to retrieve checks from consul")
|
|
|
|
}
|
|
|
|
for k, v := range nsChecks {
|
|
|
|
checks[k] = v
|
|
|
|
}
|
2017-07-04 19:24:27 +00:00
|
|
|
}
|
|
|
|
|
2017-08-07 22:54:05 +00:00
|
|
|
// Populate the object
|
|
|
|
for _, treg := range reg.Tasks {
|
|
|
|
for serviceID, sreg := range treg.Services {
|
|
|
|
sreg.Service = services[serviceID]
|
|
|
|
for checkID := range sreg.checkIDs {
|
|
|
|
if check, ok := checks[checkID]; ok {
|
|
|
|
sreg.Checks = append(sreg.Checks, check)
|
|
|
|
}
|
|
|
|
}
|
2017-07-04 19:24:27 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-08-07 22:54:05 +00:00
|
|
|
return reg, nil
|
2017-07-04 19:24:27 +00:00
|
|
|
}
|
|
|
|
|
2019-09-16 20:26:06 +00:00
|
|
|
// UpdateTTL is used to update the TTL of a check. Typically this will only be
|
|
|
|
// called to heartbeat script checks.
|
2021-03-16 18:22:21 +00:00
|
|
|
func (c *ServiceClient) UpdateTTL(id, namespace, output, status string) error {
|
|
|
|
return c.agentAPI.UpdateTTLOpts(id, output, status, &api.QueryOptions{Namespace: normalizeNamespace(namespace)})
|
support script checks for task group services (#6197)
In Nomad prior to Consul Connect, all Consul checks work the same
except for Script checks. Because the Task being checked is running in
its own container namespaces, the check is executed by Nomad in the
Task's context. If the Script check passes, Nomad uses the TTL check
feature of Consul to update the check status. This means in order to
run a Script check, we need to know what Task to execute it in.
To support Consul Connect, we need Group Services, and these need to
be registered in Consul along with their checks. We could push the
Service down into the Task, but this doesn't work if someone wants to
associate a service with a task's ports, but do script checks in
another task in the allocation.
Because Nomad is handling the Script check and not Consul anyways,
this moves the script check handling into the task runner so that the
task runner can own the script check's configuration and
lifecycle. This will allow us to pass the group service check
configuration down into a task without associating the service itself
with the task.
When tasks are checked for script checks, we walk back through their
task group to see if there are script checks associated with the
task. If so, we'll spin off script check tasklets for them. The
group-level service and any restart behaviors it needs are entirely
encapsulated within the group service hook.
2019-09-03 19:09:04 +00:00
|
|
|
}
|
|
|
|
|
2018-03-11 18:40:53 +00:00
|
|
|
// Shutdown the Consul client. Update running task registrations and deregister
|
2017-04-18 00:07:42 +00:00
|
|
|
// agent from Consul. On first call blocks up to shutdownWait before giving up
|
|
|
|
// on syncing operations.
|
2017-02-01 00:43:57 +00:00
|
|
|
func (c *ServiceClient) Shutdown() error {
|
2017-04-18 00:07:42 +00:00
|
|
|
// Serialize Shutdown calls with RegisterAgent to prevent leaking agent
|
|
|
|
// entries.
|
|
|
|
c.agentLock.Lock()
|
2017-07-24 18:40:37 +00:00
|
|
|
defer c.agentLock.Unlock()
|
2017-02-01 00:43:57 +00:00
|
|
|
select {
|
|
|
|
case <-c.shutdownCh:
|
|
|
|
return nil
|
|
|
|
default:
|
2017-07-24 18:40:37 +00:00
|
|
|
close(c.shutdownCh)
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
|
2017-04-12 19:07:10 +00:00
|
|
|
// Give run loop time to sync, but don't block indefinitely
|
|
|
|
deadline := time.After(c.shutdownWait)
|
2017-02-01 00:43:57 +00:00
|
|
|
|
2017-04-08 00:10:26 +00:00
|
|
|
// Wait for Run to finish any outstanding operations and exit
|
2017-02-01 00:43:57 +00:00
|
|
|
select {
|
2017-04-08 00:10:26 +00:00
|
|
|
case <-c.exitCh:
|
2017-02-01 00:43:57 +00:00
|
|
|
case <-deadline:
|
|
|
|
// Don't wait forever though
|
2017-07-24 18:40:37 +00:00
|
|
|
}
|
|
|
|
|
2017-07-24 23:48:40 +00:00
|
|
|
// If Consul was never seen nothing could be written so exit early
|
|
|
|
if !c.hasSeen() {
|
2017-07-24 19:12:02 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2017-07-24 18:40:37 +00:00
|
|
|
// Always attempt to deregister Nomad agent Consul entries, even if
|
|
|
|
// deadline was reached
|
|
|
|
for id := range c.agentServices {
|
2021-03-16 18:22:21 +00:00
|
|
|
if err := c.agentAPI.ServiceDeregisterOpts(id, nil); err != nil {
|
2018-09-13 17:43:40 +00:00
|
|
|
c.logger.Error("failed deregistering agent service", "service_id", id, "error", err)
|
2017-07-24 18:40:37 +00:00
|
|
|
}
|
|
|
|
}
|
2020-10-06 00:30:29 +00:00
|
|
|
|
2021-03-16 18:22:21 +00:00
|
|
|
namespaces, err := c.namespacesClient.List()
|
2020-10-06 00:30:29 +00:00
|
|
|
if err != nil {
|
2021-03-16 18:22:21 +00:00
|
|
|
c.logger.Error("failed to retrieve namespaces from consul", "error", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
remainingChecks := make(map[string]*api.AgentCheck)
|
|
|
|
for _, namespace := range namespaces {
|
|
|
|
nsChecks, err := c.agentAPI.ChecksWithFilterOpts("", &api.QueryOptions{Namespace: normalizeNamespace(namespace)})
|
|
|
|
if err != nil {
|
|
|
|
c.logger.Error("failed to retrieve checks from consul", "error", err)
|
|
|
|
}
|
|
|
|
for k, v := range nsChecks {
|
|
|
|
remainingChecks[k] = v
|
|
|
|
}
|
2020-10-06 00:30:29 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
checkRemains := func(id string) bool {
|
|
|
|
for _, c := range remainingChecks {
|
|
|
|
if c.CheckID == id {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
2017-07-24 18:40:37 +00:00
|
|
|
for id := range c.agentChecks {
|
2020-10-06 00:30:29 +00:00
|
|
|
// if we couldn't populate remainingChecks it is unlikely that CheckDeregister will work, but try anyway
|
|
|
|
// if we could list the remaining checks, verify that the check we store still exists before removing it.
|
|
|
|
if remainingChecks == nil || checkRemains(id) {
|
2021-03-16 18:22:21 +00:00
|
|
|
ns := remainingChecks[id].Namespace
|
|
|
|
if err := c.agentAPI.CheckDeregisterOpts(id, &api.QueryOptions{Namespace: ns}); err != nil {
|
2020-10-06 00:30:29 +00:00
|
|
|
c.logger.Error("failed deregistering agent check", "check_id", id, "error", err)
|
|
|
|
}
|
2017-07-24 18:40:37 +00:00
|
|
|
}
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
|
2017-04-12 19:07:10 +00:00
|
|
|
return nil
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
|
2019-11-18 18:04:01 +00:00
|
|
|
// addRegistration adds the service registrations for the given allocation.
|
|
|
|
func (c *ServiceClient) addRegistrations(allocID, taskName string, reg *ServiceRegistrations) {
|
2017-08-07 22:54:05 +00:00
|
|
|
c.allocRegistrationsLock.Lock()
|
|
|
|
defer c.allocRegistrationsLock.Unlock()
|
|
|
|
|
|
|
|
alloc, ok := c.allocRegistrations[allocID]
|
|
|
|
if !ok {
|
|
|
|
alloc = &AllocRegistration{
|
2019-11-18 18:04:01 +00:00
|
|
|
Tasks: make(map[string]*ServiceRegistrations),
|
2017-08-07 22:54:05 +00:00
|
|
|
}
|
|
|
|
c.allocRegistrations[allocID] = alloc
|
|
|
|
}
|
|
|
|
alloc.Tasks[taskName] = reg
|
|
|
|
}
|
|
|
|
|
2019-11-18 18:04:01 +00:00
|
|
|
// removeRegistrations removes the registration for the given allocation.
|
|
|
|
func (c *ServiceClient) removeRegistration(allocID, taskName string) {
|
2017-08-07 22:54:05 +00:00
|
|
|
c.allocRegistrationsLock.Lock()
|
|
|
|
defer c.allocRegistrationsLock.Unlock()
|
|
|
|
|
|
|
|
alloc, ok := c.allocRegistrations[allocID]
|
|
|
|
if !ok {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// Delete the task and if it is the last one also delete the alloc's
|
|
|
|
// registration
|
|
|
|
delete(alloc.Tasks, taskName)
|
|
|
|
if len(alloc.Tasks) == 0 {
|
|
|
|
delete(c.allocRegistrations, allocID)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-02-01 00:43:57 +00:00
|
|
|
// makeAgentServiceID creates a unique ID for identifying an agent service in
|
|
|
|
// Consul.
|
|
|
|
//
|
|
|
|
// Agent service IDs are of the form:
|
|
|
|
//
|
2017-12-08 01:08:25 +00:00
|
|
|
// {nomadServicePrefix}-{ROLE}-b32(sha1({Service.Name}-{Service.Tags...})
|
2017-12-12 00:50:15 +00:00
|
|
|
// Example Server ID: _nomad-server-fbbk265qn4tmt25nd4ep42tjvmyj3hr4
|
|
|
|
// Example Client ID: _nomad-client-ggnjpgl7yn7rgmvxzilmpvrzzvrszc7l
|
2017-02-01 00:43:57 +00:00
|
|
|
//
|
|
|
|
func makeAgentServiceID(role string, service *structs.Service) string {
|
2018-04-23 23:34:53 +00:00
|
|
|
return fmt.Sprintf("%s-%s-%s", nomadServicePrefix, role, service.Hash(role, "", false))
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
|
2019-11-18 18:04:01 +00:00
|
|
|
// MakeAllocServiceID creates a unique ID for identifying an alloc service in
|
2019-06-13 12:57:27 +00:00
|
|
|
// Consul.
|
2017-02-01 00:43:57 +00:00
|
|
|
//
|
2019-06-13 12:57:27 +00:00
|
|
|
// Example Service ID: _nomad-task-b4e61df9-b095-d64e-f241-23860da1375f-redis-http-http
|
2019-11-18 18:04:01 +00:00
|
|
|
func MakeAllocServiceID(allocID, taskName string, service *structs.Service) string {
|
2019-06-13 12:57:27 +00:00
|
|
|
return fmt.Sprintf("%s%s-%s-%s-%s", nomadTaskPrefix, allocID, taskName, service.Name, service.PortLabel)
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
|
support script checks for task group services (#6197)
In Nomad prior to Consul Connect, all Consul checks work the same
except for Script checks. Because the Task being checked is running in
its own container namespaces, the check is executed by Nomad in the
Task's context. If the Script check passes, Nomad uses the TTL check
feature of Consul to update the check status. This means in order to
run a Script check, we need to know what Task to execute it in.
To support Consul Connect, we need Group Services, and these need to
be registered in Consul along with their checks. We could push the
Service down into the Task, but this doesn't work if someone wants to
associate a service with a task's ports, but do script checks in
another task in the allocation.
Because Nomad is handling the Script check and not Consul anyways,
this moves the script check handling into the task runner so that the
task runner can own the script check's configuration and
lifecycle. This will allow us to pass the group service check
configuration down into a task without associating the service itself
with the task.
When tasks are checked for script checks, we walk back through their
task group to see if there are script checks associated with the
task. If so, we'll spin off script check tasklets for them. The
group-level service and any restart behaviors it needs are entirely
encapsulated within the group service hook.
2019-09-03 19:09:04 +00:00
|
|
|
// MakeCheckID creates a unique ID for a check.
|
2019-05-09 11:22:22 +00:00
|
|
|
//
|
|
|
|
// Example Check ID: _nomad-check-434ae42f9a57c5705344974ac38de2aee0ee089d
|
support script checks for task group services (#6197)
In Nomad prior to Consul Connect, all Consul checks work the same
except for Script checks. Because the Task being checked is running in
its own container namespaces, the check is executed by Nomad in the
Task's context. If the Script check passes, Nomad uses the TTL check
feature of Consul to update the check status. This means in order to
run a Script check, we need to know what Task to execute it in.
To support Consul Connect, we need Group Services, and these need to
be registered in Consul along with their checks. We could push the
Service down into the Task, but this doesn't work if someone wants to
associate a service with a task's ports, but do script checks in
another task in the allocation.
Because Nomad is handling the Script check and not Consul anyways,
this moves the script check handling into the task runner so that the
task runner can own the script check's configuration and
lifecycle. This will allow us to pass the group service check
configuration down into a task without associating the service itself
with the task.
When tasks are checked for script checks, we walk back through their
task group to see if there are script checks associated with the
task. If so, we'll spin off script check tasklets for them. The
group-level service and any restart behaviors it needs are entirely
encapsulated within the group service hook.
2019-09-03 19:09:04 +00:00
|
|
|
func MakeCheckID(serviceID string, check *structs.ServiceCheck) string {
|
consul: Use a stable identifier for services
The current implementation of Service Registration uses a hash of the
nomad-internal state of a service to register it with Consul, this means that
any update to the service invalidates this name and we then deregister, and
recreate the service in Consul.
While this behaviour slightly simplifies reasoning about service registration,
this becomes problematic when we add consul health checks to a service. When
the service is re-registered, so are the checks, which default to failing for
at least one check period.
This commit migrates us to using a stable identifier based on the
allocation, task, and service identifiers, and uses the difference
between the remote and local state to decide when to push updates.
It uses the existing hashing mechanic to decide when UpdateTask should
regenerate service registrations for providing to Sync, but this should
be removable as part of a future refactor.
It additionally introduces the _nomad-check- prefix for check
definitions, to allow for future allowing of consul features like
maintenance mode.
2019-04-10 08:39:24 +00:00
|
|
|
return fmt.Sprintf("%s%s", nomadCheckPrefix, check.Hash(serviceID))
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// createCheckReg creates a Check that can be registered with Consul.
|
|
|
|
//
|
2017-04-12 20:27:56 +00:00
|
|
|
// Script checks simply have a TTL set and the caller is responsible for
|
2020-03-07 03:15:22 +00:00
|
|
|
// running the script and heart-beating.
|
2021-03-16 18:22:21 +00:00
|
|
|
func createCheckReg(serviceID, checkID string, check *structs.ServiceCheck, host string, port int, namespace string) (*api.AgentCheckRegistration, error) {
|
2017-02-01 00:43:57 +00:00
|
|
|
chkReg := api.AgentCheckRegistration{
|
|
|
|
ID: checkID,
|
|
|
|
Name: check.Name,
|
|
|
|
ServiceID: serviceID,
|
2021-03-16 18:22:21 +00:00
|
|
|
Namespace: normalizeNamespace(namespace),
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
|
|
|
chkReg.Status = check.InitialStatus
|
|
|
|
chkReg.Timeout = check.Timeout.String()
|
|
|
|
chkReg.Interval = check.Interval.String()
|
2020-08-08 01:22:06 +00:00
|
|
|
chkReg.SuccessBeforePassing = check.SuccessBeforePassing
|
|
|
|
chkReg.FailuresBeforeCritical = check.FailuresBeforeCritical
|
2017-02-01 00:43:57 +00:00
|
|
|
|
2017-12-19 00:18:42 +00:00
|
|
|
// Require an address for http or tcp checks
|
|
|
|
if port == 0 && check.RequiresPort() {
|
|
|
|
return nil, fmt.Errorf("%s checks require an address", check.Type)
|
|
|
|
}
|
|
|
|
|
2017-02-01 00:43:57 +00:00
|
|
|
switch check.Type {
|
|
|
|
case structs.ServiceCheckHTTP:
|
2017-04-26 18:22:01 +00:00
|
|
|
proto := check.Protocol
|
|
|
|
if proto == "" {
|
|
|
|
proto = "http"
|
2017-02-01 00:43:57 +00:00
|
|
|
}
|
2017-04-19 04:28:25 +00:00
|
|
|
if check.TLSSkipVerify {
|
|
|
|
chkReg.TLSSkipVerify = true
|
|
|
|
}
|
2017-02-01 00:43:57 +00:00
|
|
|
base := url.URL{
|
2017-04-26 18:22:01 +00:00
|
|
|
Scheme: proto,
|
2017-02-01 00:43:57 +00:00
|
|
|
Host: net.JoinHostPort(host, strconv.Itoa(port)),
|
|
|
|
}
|
|
|
|
relative, err := url.Parse(check.Path)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2020-03-07 03:15:22 +00:00
|
|
|
checkURL := base.ResolveReference(relative)
|
|
|
|
chkReg.HTTP = checkURL.String()
|
2017-08-15 23:13:05 +00:00
|
|
|
chkReg.Method = check.Method
|
|
|
|
chkReg.Header = check.Header
|
2018-05-02 23:49:47 +00:00
|
|
|
|
2017-02-01 00:43:57 +00:00
|
|
|
case structs.ServiceCheckTCP:
|
|
|
|
chkReg.TCP = net.JoinHostPort(host, strconv.Itoa(port))
|
2018-05-02 23:49:47 +00:00
|
|
|
|
2017-02-01 00:43:57 +00:00
|
|
|
case structs.ServiceCheckScript:
|
|
|
|
chkReg.TTL = (check.Interval + ttlCheckBuffer).String()
|
2017-10-17 00:35:47 +00:00
|
|
|
// As of Consul 1.0.0 setting TTL and Interval is a 400
|
|
|
|
chkReg.Interval = ""
|
2018-05-02 23:49:47 +00:00
|
|
|
|
|
|
|
case structs.ServiceCheckGRPC:
|
2018-05-03 22:18:12 +00:00
|
|
|
chkReg.GRPC = fmt.Sprintf("%s/%s", net.JoinHostPort(host, strconv.Itoa(port)), check.GRPCService)
|
2018-05-02 23:49:47 +00:00
|
|
|
chkReg.GRPCUseTLS = check.GRPCUseTLS
|
|
|
|
if check.TLSSkipVerify {
|
|
|
|
chkReg.TLSSkipVerify = true
|
|
|
|
}
|
|
|
|
|
2017-02-01 00:43:57 +00:00
|
|
|
default:
|
|
|
|
return nil, fmt.Errorf("check type %+q not valid", check.Type)
|
|
|
|
}
|
|
|
|
return &chkReg, nil
|
|
|
|
}
|
2017-04-08 00:10:26 +00:00
|
|
|
|
2019-04-25 11:48:19 +00:00
|
|
|
// isNomadCheck returns true if the ID matches the pattern of a Nomad managed
|
|
|
|
// check.
|
|
|
|
func isNomadCheck(id string) bool {
|
|
|
|
return strings.HasPrefix(id, nomadCheckPrefix)
|
|
|
|
}
|
|
|
|
|
2017-04-08 00:10:26 +00:00
|
|
|
// isNomadService returns true if the ID matches the pattern of a Nomad managed
|
2017-12-08 01:08:25 +00:00
|
|
|
// service (new or old formats). Agent services return false as independent
|
|
|
|
// client and server agents may be running on the same machine. #2827
|
2017-04-08 00:10:26 +00:00
|
|
|
func isNomadService(id string) bool {
|
2017-12-08 01:08:25 +00:00
|
|
|
return strings.HasPrefix(id, nomadTaskPrefix) || isOldNomadService(id)
|
|
|
|
}
|
|
|
|
|
|
|
|
// isOldNomadService returns true if the ID matches an old pattern managed by
|
|
|
|
// Nomad.
|
|
|
|
//
|
|
|
|
// Pre-0.7.1 task service IDs are of the form:
|
|
|
|
//
|
|
|
|
// {nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...}
|
|
|
|
// Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3
|
|
|
|
//
|
|
|
|
func isOldNomadService(id string) bool {
|
2017-07-18 20:23:01 +00:00
|
|
|
const prefix = nomadServicePrefix + "-executor"
|
|
|
|
return strings.HasPrefix(id, prefix)
|
2017-04-08 00:10:26 +00:00
|
|
|
}
|
2017-12-05 19:39:42 +00:00
|
|
|
|
2020-02-14 19:44:34 +00:00
|
|
|
// sidecarSuffix is the suffix appended to a managed service's ID to form the
// ID of its sidecar proxy service.
const sidecarSuffix = "-sidecar-proxy"
|
|
|
|
|
2019-08-30 18:05:30 +00:00
|
|
|
// isNomadSidecar returns true if the ID matches a sidecar proxy for a Nomad
|
|
|
|
// managed service.
|
|
|
|
//
|
|
|
|
// For example if you have a Connect enabled service with the ID:
|
|
|
|
//
|
|
|
|
// _nomad-task-5229c7f8-376b-3ccc-edd9-981e238f7033-cache-redis-cache-db
|
|
|
|
//
|
|
|
|
// Consul will create a service for the sidecar proxy with the ID:
|
|
|
|
//
|
|
|
|
// _nomad-task-5229c7f8-376b-3ccc-edd9-981e238f7033-cache-redis-cache-db-sidecar-proxy
|
|
|
|
//
|
|
|
|
func isNomadSidecar(id string, services map[string]*api.AgentServiceRegistration) bool {
|
2020-02-14 19:44:34 +00:00
|
|
|
if !strings.HasSuffix(id, sidecarSuffix) {
|
2019-08-30 18:05:30 +00:00
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
// Make sure the Nomad managed service for this proxy still exists.
|
2020-02-14 19:44:34 +00:00
|
|
|
_, ok := services[id[:len(id)-len(sidecarSuffix)]]
|
2019-08-30 18:05:30 +00:00
|
|
|
return ok
|
|
|
|
}
|
|
|
|
|
2020-02-14 19:44:34 +00:00
|
|
|
// getNomadSidecar returns the service registration of the sidecar for the managed
|
|
|
|
// service with the specified id.
|
|
|
|
//
|
|
|
|
// If the managed service of the specified id does not exist, or the service does
|
|
|
|
// not have a sidecar proxy, nil is returned.
|
|
|
|
func getNomadSidecar(id string, services map[string]*api.AgentService) *api.AgentService {
|
|
|
|
if _, exists := services[id]; !exists {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
sidecarID := id + sidecarSuffix
|
|
|
|
return services[sidecarID]
|
|
|
|
}
|
|
|
|
|
2017-12-19 00:18:42 +00:00
|
|
|
// getAddress returns the IP and port to use for a service or check. If no port
|
|
|
|
// label is specified (an empty value), zero values are returned because no
|
|
|
|
// address could be resolved.
|
2020-10-15 19:32:21 +00:00
|
|
|
func getAddress(addrMode, portLabel string, networks structs.Networks, driverNet *drivers.DriverNetwork, ports structs.AllocatedPorts, netStatus *structs.AllocNetworkStatus) (string, int, error) {
|
2017-12-05 19:39:42 +00:00
|
|
|
switch addrMode {
|
|
|
|
case structs.AddressModeAuto:
|
|
|
|
if driverNet.Advertise() {
|
|
|
|
addrMode = structs.AddressModeDriver
|
|
|
|
} else {
|
|
|
|
addrMode = structs.AddressModeHost
|
|
|
|
}
|
2020-10-15 19:32:21 +00:00
|
|
|
return getAddress(addrMode, portLabel, networks, driverNet, ports, netStatus)
|
2017-12-05 19:39:42 +00:00
|
|
|
case structs.AddressModeHost:
|
2017-12-20 23:02:34 +00:00
|
|
|
if portLabel == "" {
|
|
|
|
if len(networks) != 1 {
|
|
|
|
// If no networks are specified return zero
|
|
|
|
// values. Consul will advertise the host IP
|
|
|
|
// with no port. This is the pre-0.7.1 behavior
|
|
|
|
// some people rely on.
|
|
|
|
return "", 0, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
return networks[0].IP, 0, nil
|
|
|
|
}
|
|
|
|
|
2017-12-05 19:39:42 +00:00
|
|
|
// Default path: use host ip:port
|
2020-10-15 19:32:21 +00:00
|
|
|
// Try finding port in the AllocatedPorts struct first
|
|
|
|
// Check in Networks struct for backwards compatibility if not found
|
|
|
|
mapping, ok := ports.Get(portLabel)
|
|
|
|
if !ok {
|
2021-02-09 12:05:28 +00:00
|
|
|
mapping = networks.Port(portLabel)
|
|
|
|
if mapping.Value > 0 {
|
|
|
|
return mapping.HostIP, mapping.Value, nil
|
2020-11-05 20:00:22 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// If port isn't a label, try to parse it as a literal port number
|
|
|
|
port, err := strconv.Atoi(portLabel)
|
|
|
|
if err != nil {
|
|
|
|
// Don't include Atoi error message as user likely
|
|
|
|
// never intended it to be a numeric and it creates a
|
|
|
|
// confusing error message
|
2020-10-15 19:32:21 +00:00
|
|
|
return "", 0, fmt.Errorf("invalid port %q: port label not found", portLabel)
|
|
|
|
}
|
2020-11-05 20:00:22 +00:00
|
|
|
if port <= 0 {
|
|
|
|
return "", 0, fmt.Errorf("invalid port: %q: port must be >0", portLabel)
|
|
|
|
}
|
|
|
|
|
|
|
|
// A number was given which will use the Consul agent's address and the given port
|
|
|
|
// Returning a blank string as an address will use the Consul agent's address
|
|
|
|
return "", port, nil
|
2017-12-08 05:58:15 +00:00
|
|
|
}
|
2020-10-15 19:32:21 +00:00
|
|
|
return mapping.HostIP, mapping.Value, nil
|
2017-12-05 19:39:42 +00:00
|
|
|
|
|
|
|
case structs.AddressModeDriver:
|
|
|
|
// Require a driver network if driver address mode is used
|
|
|
|
if driverNet == nil {
|
|
|
|
return "", 0, fmt.Errorf(`cannot use address_mode="driver": no driver network exists`)
|
|
|
|
}
|
|
|
|
|
2017-12-20 23:02:34 +00:00
|
|
|
// If no port label is specified just return the IP
|
|
|
|
if portLabel == "" {
|
|
|
|
return driverNet.IP, 0, nil
|
|
|
|
}
|
|
|
|
|
2017-12-05 19:39:42 +00:00
|
|
|
// If the port is a label, use the driver's port (not the host's)
|
2020-10-15 19:32:21 +00:00
|
|
|
if port, ok := ports.Get(portLabel); ok {
|
|
|
|
return driverNet.IP, port.To, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check if old style driver portmap is used
|
2017-12-05 19:39:42 +00:00
|
|
|
if port, ok := driverNet.PortMap[portLabel]; ok {
|
|
|
|
return driverNet.IP, port, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// If port isn't a label, try to parse it as a literal port number
|
|
|
|
port, err := strconv.Atoi(portLabel)
|
|
|
|
if err != nil {
|
2018-01-12 23:32:51 +00:00
|
|
|
// Don't include Atoi error message as user likely
|
|
|
|
// never intended it to be a numeric and it creates a
|
|
|
|
// confusing error message
|
|
|
|
return "", 0, fmt.Errorf("invalid port label %q: port labels in driver address_mode must be numeric or in the driver's port map", portLabel)
|
2017-12-05 19:39:42 +00:00
|
|
|
}
|
2017-12-08 20:27:57 +00:00
|
|
|
if port <= 0 {
|
2018-01-12 23:32:51 +00:00
|
|
|
return "", 0, fmt.Errorf("invalid port: %q: port must be >0", portLabel)
|
2017-12-08 06:04:22 +00:00
|
|
|
}
|
2017-12-05 19:39:42 +00:00
|
|
|
|
|
|
|
return driverNet.IP, port, nil
|
|
|
|
|
2021-01-06 18:52:48 +00:00
|
|
|
case structs.AddressModeAlloc:
|
2020-10-15 19:32:21 +00:00
|
|
|
if netStatus == nil {
|
|
|
|
return "", 0, fmt.Errorf(`cannot use address_mode="alloc": no allocation network status reported`)
|
|
|
|
}
|
|
|
|
|
|
|
|
// If no port label is specified just return the IP
|
|
|
|
if portLabel == "" {
|
|
|
|
return netStatus.Address, 0, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// If port is a label and is found then return it
|
|
|
|
if port, ok := ports.Get(portLabel); ok {
|
2021-01-06 18:52:48 +00:00
|
|
|
// Use port.To value unless not set
|
|
|
|
if port.To > 0 {
|
|
|
|
return netStatus.Address, port.To, nil
|
|
|
|
}
|
2020-10-15 19:32:21 +00:00
|
|
|
return netStatus.Address, port.Value, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check if port is a literal number
|
|
|
|
port, err := strconv.Atoi(portLabel)
|
|
|
|
if err != nil {
|
|
|
|
// User likely specified wrong port label here
|
|
|
|
return "", 0, fmt.Errorf("invalid port %q: port label not found or is not numeric", portLabel)
|
|
|
|
}
|
|
|
|
if port <= 0 {
|
|
|
|
return "", 0, fmt.Errorf("invalid port: %q: port must be >0", portLabel)
|
|
|
|
}
|
|
|
|
return netStatus.Address, port, nil
|
|
|
|
|
2017-12-05 19:39:42 +00:00
|
|
|
default:
|
|
|
|
// Shouldn't happen due to validation, but enforce invariants
|
|
|
|
return "", 0, fmt.Errorf("invalid address mode %q", addrMode)
|
|
|
|
}
|
|
|
|
}
|