2015-07-23 21:41:18 +00:00
|
|
|
package nomad
|
|
|
|
|
|
|
|
import (
|
2016-08-18 20:52:15 +00:00
|
|
|
"context"
|
2015-07-23 22:15:48 +00:00
|
|
|
"fmt"
|
2017-07-01 00:23:34 +00:00
|
|
|
"sort"
|
2016-08-17 00:50:14 +00:00
|
|
|
"strings"
|
2015-07-23 21:41:18 +00:00
|
|
|
"time"
|
|
|
|
|
2019-01-15 19:46:12 +00:00
|
|
|
metrics "github.com/armon/go-metrics"
|
2018-09-15 23:23:13 +00:00
|
|
|
log "github.com/hashicorp/go-hclog"
|
|
|
|
memdb "github.com/hashicorp/go-memdb"
|
2019-01-15 19:46:12 +00:00
|
|
|
multierror "github.com/hashicorp/go-multierror"
|
2018-09-15 23:23:13 +00:00
|
|
|
|
2016-11-26 04:02:18 +00:00
|
|
|
"github.com/golang/snappy"
|
2016-09-01 19:05:08 +00:00
|
|
|
"github.com/hashicorp/consul/lib"
|
2020-02-21 21:23:30 +00:00
|
|
|
"github.com/pkg/errors"
|
|
|
|
|
2017-08-21 04:31:45 +00:00
|
|
|
"github.com/hashicorp/nomad/acl"
|
2017-01-18 23:55:14 +00:00
|
|
|
"github.com/hashicorp/nomad/helper"
|
2017-09-29 16:58:48 +00:00
|
|
|
"github.com/hashicorp/nomad/helper/uuid"
|
2017-02-08 04:31:23 +00:00
|
|
|
"github.com/hashicorp/nomad/nomad/state"
|
2015-07-23 21:41:18 +00:00
|
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
2016-05-05 18:21:58 +00:00
|
|
|
"github.com/hashicorp/nomad/scheduler"
|
2015-07-23 21:41:18 +00:00
|
|
|
)
|
|
|
|
|
2016-06-08 23:48:02 +00:00
|
|
|
const (
	// RegisterEnforceIndexErrPrefix starts every error message produced when
	// a job registration fails the job modify index enforcement check.
	RegisterEnforceIndexErrPrefix = "Enforcing job modify index"

	// DispatchPayloadSizeLimit caps the size of the uncompressed input data
	// payload (16 KiB).
	DispatchPayloadSizeLimit = 16 * 1024
)
|
|
|
|
|
2020-09-10 22:08:25 +00:00
|
|
|
// ErrMultipleNamespaces is sent when multiple Vault namespaces are used in
// the OSS setup; its message states that this requires Nomad Enterprise.
var ErrMultipleNamespaces = errors.New("multiple Vault namespaces requires Nomad Enterprise")
|
2020-09-10 22:08:25 +00:00
|
|
|
|
2016-09-01 21:23:40 +00:00
|
|
|
var (
|
2018-05-08 22:26:36 +00:00
|
|
|
// allowRescheduleTransition is the transition that allows failed
|
|
|
|
// allocations to be force rescheduled. We create a one off
|
|
|
|
// variable to avoid creating a new object for every request.
|
|
|
|
allowForceRescheduleTransition = &structs.DesiredTransition{
|
|
|
|
ForceReschedule: helper.BoolToPtr(true),
|
|
|
|
}
|
2016-09-01 21:23:40 +00:00
|
|
|
)
|
|
|
|
|
2015-07-23 21:41:18 +00:00
|
|
|
// Job endpoint is used for job interactions
|
|
|
|
type Job struct {
|
2018-09-15 23:23:13 +00:00
|
|
|
srv *Server
|
|
|
|
logger log.Logger
|
2019-08-15 15:22:37 +00:00
|
|
|
|
|
|
|
// builtin admission controllers
|
|
|
|
mutators []jobMutator
|
|
|
|
validators []jobValidator
|
|
|
|
}
|
|
|
|
|
|
|
|
// NewJobEndpoints creates a new job endpoint with builtin admission controllers
|
|
|
|
func NewJobEndpoints(s *Server) *Job {
|
|
|
|
return &Job{
|
|
|
|
srv: s,
|
|
|
|
logger: s.logger.Named("job"),
|
|
|
|
mutators: []jobMutator{
|
|
|
|
jobCanonicalizer{},
|
2019-12-12 23:46:14 +00:00
|
|
|
jobConnectHook{},
|
connect: enable automatic expose paths for individual group service checks
Part of #6120
Building on the support for enabling connect proxy paths in #7323, this change
adds the ability to configure the 'service.check.expose' flag on group-level
service check definitions for services that are connect-enabled. This is a slight
deviation from the "magic" that Consul provides. With Consul, the 'expose' flag
exists on the connect.proxy stanza, which will then auto-generate expose paths
for every HTTP and gRPC service check associated with that connect-enabled
service.
A first attempt at providing similar magic for Nomad's Consul Connect integration
followed that pattern exactly, as seen in #7396. However, on reviewing the PR
we realized having the `expose` flag on the proxy stanza inseperably ties together
the automatic path generation with every HTTP/gRPC defined on the service. This
makes sense in Consul's context, because a service definition is reasonably
associated with a single "task". With Nomad's group level service definitions
however, there is a reasonable expectation that a service definition is more
abstractly representative of multiple services within the task group. In this
case, one would want to define checks of that service which concretely make HTTP
or gRPC requests to different underlying tasks. Such a model is not possible
with the course `proxy.expose` flag.
Instead, we now have the flag made available within the check definitions themselves.
By making the expose feature resolute to each check, it is possible to have
some HTTP/gRPC checks which make use of the envoy exposed paths, as well as
some HTTP/gRPC checks which make use of some orthongonal port-mapping to do
checks on some other task (or even some other bound port of the same task)
within the task group.
Given this example,
group "server-group" {
network {
mode = "bridge"
port "forchecks" {
to = -1
}
}
service {
name = "myserver"
port = 2000
connect {
sidecar_service {
}
}
check {
name = "mycheck-myserver"
type = "http"
port = "forchecks"
interval = "3s"
timeout = "2s"
method = "GET"
path = "/classic/responder/health"
expose = true
}
}
}
Nomad will automatically inject (via job endpoint mutator) the
extrapolated expose path configuration, i.e.
expose {
path {
path = "/classic/responder/health"
protocol = "http"
local_path_port = 2000
listener_port = "forchecks"
}
}
Documentation is coming in #7440 (needs updating, doing next)
Modifications to the `countdash` examples in https://github.com/hashicorp/demo-consul-101/pull/6
which will make the examples in the documentation actually runnable.
Will add some e2e tests based on the above when it becomes available.
2020-03-25 01:49:55 +00:00
|
|
|
jobExposeCheckHook{},
|
2019-08-15 15:22:37 +00:00
|
|
|
jobImpliedConstraints{},
|
|
|
|
},
|
|
|
|
validators: []jobValidator{
|
|
|
|
jobConnectHook{},
|
connect: enable automatic expose paths for individual group service checks
Part of #6120
Building on the support for enabling connect proxy paths in #7323, this change
adds the ability to configure the 'service.check.expose' flag on group-level
service check definitions for services that are connect-enabled. This is a slight
deviation from the "magic" that Consul provides. With Consul, the 'expose' flag
exists on the connect.proxy stanza, which will then auto-generate expose paths
for every HTTP and gRPC service check associated with that connect-enabled
service.
A first attempt at providing similar magic for Nomad's Consul Connect integration
followed that pattern exactly, as seen in #7396. However, on reviewing the PR
we realized having the `expose` flag on the proxy stanza inseperably ties together
the automatic path generation with every HTTP/gRPC defined on the service. This
makes sense in Consul's context, because a service definition is reasonably
associated with a single "task". With Nomad's group level service definitions
however, there is a reasonable expectation that a service definition is more
abstractly representative of multiple services within the task group. In this
case, one would want to define checks of that service which concretely make HTTP
or gRPC requests to different underlying tasks. Such a model is not possible
with the course `proxy.expose` flag.
Instead, we now have the flag made available within the check definitions themselves.
By making the expose feature resolute to each check, it is possible to have
some HTTP/gRPC checks which make use of the envoy exposed paths, as well as
some HTTP/gRPC checks which make use of some orthongonal port-mapping to do
checks on some other task (or even some other bound port of the same task)
within the task group.
Given this example,
group "server-group" {
network {
mode = "bridge"
port "forchecks" {
to = -1
}
}
service {
name = "myserver"
port = 2000
connect {
sidecar_service {
}
}
check {
name = "mycheck-myserver"
type = "http"
port = "forchecks"
interval = "3s"
timeout = "2s"
method = "GET"
path = "/classic/responder/health"
expose = true
}
}
}
Nomad will automatically inject (via job endpoint mutator) the
extrapolated expose path configuration, i.e.
expose {
path {
path = "/classic/responder/health"
protocol = "http"
local_path_port = 2000
listener_port = "forchecks"
}
}
Documentation is coming in #7440 (needs updating, doing next)
Modifications to the `countdash` examples in https://github.com/hashicorp/demo-consul-101/pull/6
which will make the examples in the documentation actually runnable.
Will add some e2e tests based on the above when it becomes available.
2020-03-25 01:49:55 +00:00
|
|
|
jobExposeCheckHook{},
|
2019-08-15 15:22:37 +00:00
|
|
|
jobValidate{},
|
2021-04-30 02:09:56 +00:00
|
|
|
&memoryOversubscriptionValidate{srv: s},
|
2019-08-15 15:22:37 +00:00
|
|
|
},
|
|
|
|
}
|
2015-07-23 21:41:18 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Register is used to upsert a job for scheduling
|
2015-08-06 18:48:44 +00:00
|
|
|
func (j *Job) Register(args *structs.JobRegisterRequest, reply *structs.JobRegisterResponse) error {
|
2015-07-23 21:41:18 +00:00
|
|
|
if done, err := j.srv.forward("Job.Register", args, args, reply); done {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer metrics.MeasureSince([]string{"nomad", "job", "register"}, time.Now())
|
|
|
|
|
2015-07-23 22:15:48 +00:00
|
|
|
// Validate the arguments
|
|
|
|
if args.Job == nil {
|
|
|
|
return fmt.Errorf("missing job for registration")
|
|
|
|
}
|
2015-12-16 01:30:50 +00:00
|
|
|
|
2019-09-26 20:44:04 +00:00
|
|
|
// defensive check; http layer and RPC requester should ensure namespaces are set consistently
|
2019-09-27 11:40:58 +00:00
|
|
|
if args.RequestNamespace() != args.Job.Namespace {
|
|
|
|
return fmt.Errorf("mismatched request namespace in request: %q, %q", args.RequestNamespace(), args.Job.Namespace)
|
2019-09-26 20:44:04 +00:00
|
|
|
}
|
|
|
|
|
2019-08-15 15:22:37 +00:00
|
|
|
// Run admission controllers
|
|
|
|
job, warnings, err := j.admissionControllers(args.Job)
|
2017-05-10 03:52:47 +00:00
|
|
|
if err != nil {
|
2015-12-16 01:30:50 +00:00
|
|
|
return err
|
|
|
|
}
|
2019-08-15 15:22:37 +00:00
|
|
|
args.Job = job
|
2015-12-16 01:30:50 +00:00
|
|
|
|
2020-06-19 11:53:29 +00:00
|
|
|
// Attach the Nomad token's accessor ID so that deploymentwatcher
|
|
|
|
// can reference the token later
|
|
|
|
tokenID, err := j.srv.ResolveSecretToken(args.AuthToken)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if tokenID != nil {
|
|
|
|
args.Job.NomadTokenID = tokenID.AccessorID
|
|
|
|
}
|
|
|
|
|
2017-07-07 02:08:51 +00:00
|
|
|
// Set the warning message
|
2019-08-15 15:22:37 +00:00
|
|
|
reply.Warnings = structs.MergeMultierrorWarnings(warnings...)
|
2017-07-07 02:08:51 +00:00
|
|
|
|
2017-08-21 04:31:45 +00:00
|
|
|
// Check job submission permissions
|
2022-02-02 20:03:18 +00:00
|
|
|
aclObj, err := j.srv.ResolveToken(args.AuthToken)
|
|
|
|
if err != nil {
|
2017-08-21 04:31:45 +00:00
|
|
|
return err
|
2017-09-19 14:47:10 +00:00
|
|
|
} else if aclObj != nil {
|
2017-11-20 23:12:13 +00:00
|
|
|
if !aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilitySubmitJob) {
|
2017-09-19 14:47:10 +00:00
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
volumes: Add support for mount propagation
This commit introduces support for configuring mount propagation when
mounting volumes with the `volume_mount` stanza on Linux targets.
Similar to Kubernetes, we expose 3 options for configuring mount
propagation:
- private, which is equivalent to `rprivate` on Linux, which does not allow the
container to see any new nested mounts after the chroot was created.
- host-to-task, which is equivalent to `rslave` on Linux, which allows new mounts
that have been created _outside of the container_ to be visible
inside the container after the chroot is created.
- bidirectional, which is equivalent to `rshared` on Linux, which allows both
the container to see new mounts created on the host, but
importantly _allows the container to create mounts that are
visible in other containers and on the host_
private and host-to-task are safe, but bidirectional mounts can be
dangerous, as if the code inside a container creates a mount, and does
not clean it up before tearing down the container, it can cause bad
things to happen inside the kernel.
To add a layer of safety here, we require that the user has ReadWrite
permissions on the volume before allowing bidirectional mounts, as a
defense in depth / validation case, although creating mounts should also require
a privileged execution environment inside the container.
2019-09-13 21:13:20 +00:00
|
|
|
|
2019-12-06 20:46:46 +00:00
|
|
|
// Validate Volume Permissions
|
2019-07-25 14:32:19 +00:00
|
|
|
for _, tg := range args.Job.TaskGroups {
|
|
|
|
for _, vol := range tg.Volumes {
|
2020-03-17 21:32:39 +00:00
|
|
|
switch vol.Type {
|
|
|
|
case structs.VolumeTypeCSI:
|
|
|
|
if !allowCSIMount(aclObj, args.RequestNamespace()) {
|
2019-08-21 18:13:16 +00:00
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
2020-03-17 21:32:39 +00:00
|
|
|
case structs.VolumeTypeHost:
|
|
|
|
// If a volume is readonly, then we allow access if the user has ReadOnly
|
|
|
|
// or ReadWrite access to the volume. Otherwise we only allow access if
|
|
|
|
// they have ReadWrite access.
|
|
|
|
if vol.ReadOnly {
|
|
|
|
if !aclObj.AllowHostVolumeOperation(vol.Source, acl.HostVolumeCapabilityMountReadOnly) &&
|
|
|
|
!aclObj.AllowHostVolumeOperation(vol.Source, acl.HostVolumeCapabilityMountReadWrite) {
|
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if !aclObj.AllowHostVolumeOperation(vol.Source, acl.HostVolumeCapabilityMountReadWrite) {
|
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
2019-08-21 18:13:16 +00:00
|
|
|
}
|
2020-03-17 21:32:39 +00:00
|
|
|
default:
|
|
|
|
return structs.ErrPermissionDenied
|
2019-07-25 14:32:19 +00:00
|
|
|
}
|
|
|
|
}
|
volumes: Add support for mount propagation
This commit introduces support for configuring mount propagation when
mounting volumes with the `volume_mount` stanza on Linux targets.
Similar to Kubernetes, we expose 3 options for configuring mount
propagation:
- private, which is equivalent to `rprivate` on Linux, which does not allow the
container to see any new nested mounts after the chroot was created.
- host-to-task, which is equivalent to `rslave` on Linux, which allows new mounts
that have been created _outside of the container_ to be visible
inside the container after the chroot is created.
- bidirectional, which is equivalent to `rshared` on Linux, which allows both
the container to see new mounts created on the host, but
importantly _allows the container to create mounts that are
visible in other containers and on the host_
private and host-to-task are safe, but bidirectional mounts can be
dangerous, as if the code inside a container creates a mount, and does
not clean it up before tearing down the container, it can cause bad
things to happen inside the kernel.
To add a layer of safety here, we require that the user has ReadWrite
permissions on the volume before allowing bidirectional mounts, as a
defense in depth / validation case, although creating mounts should also require
a privileged execution environment inside the container.
2019-09-13 21:13:20 +00:00
|
|
|
|
|
|
|
for _, t := range tg.Tasks {
|
|
|
|
for _, vm := range t.VolumeMounts {
|
|
|
|
vol := tg.Volumes[vm.Volume]
|
|
|
|
if vm.PropagationMode == structs.VolumeMountPropagationBidirectional &&
|
|
|
|
!aclObj.AllowHostVolumeOperation(vol.Source, acl.HostVolumeCapabilityMountReadWrite) {
|
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
|
|
|
}
|
2020-03-18 19:29:03 +00:00
|
|
|
|
|
|
|
if t.CSIPluginConfig != nil {
|
|
|
|
if !aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilityCSIRegisterPlugin) {
|
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
|
|
|
}
|
volumes: Add support for mount propagation
This commit introduces support for configuring mount propagation when
mounting volumes with the `volume_mount` stanza on Linux targets.
Similar to Kubernetes, we expose 3 options for configuring mount
propagation:
- private, which is equivalent to `rprivate` on Linux, which does not allow the
container to see any new nested mounts after the chroot was created.
- host-to-task, which is equivalent to `rslave` on Linux, which allows new mounts
that have been created _outside of the container_ to be visible
inside the container after the chroot is created.
- bidirectional, which is equivalent to `rshared` on Linux, which allows both
the container to see new mounts created on the host, but
importantly _allows the container to create mounts that are
visible in other containers and on the host_
private and host-to-task are safe, but bidirectional mounts can be
dangerous, as if the code inside a container creates a mount, and does
not clean it up before tearing down the container, it can cause bad
things to happen inside the kernel.
To add a layer of safety here, we require that the user has ReadWrite
permissions on the volume before allowing bidirectional mounts, as a
defense in depth / validation case, although creating mounts should also require
a privileged execution environment inside the container.
2019-09-13 21:13:20 +00:00
|
|
|
}
|
2019-07-25 14:32:19 +00:00
|
|
|
}
|
|
|
|
|
2017-09-19 14:47:10 +00:00
|
|
|
// Check if override is set and we do not have permissions
|
|
|
|
if args.PolicyOverride {
|
2017-11-20 23:12:13 +00:00
|
|
|
if !aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilitySentinelOverride) {
|
2018-09-15 23:23:13 +00:00
|
|
|
j.logger.Warn("policy override attempted without permissions for job", "job", args.Job.ID)
|
2017-09-19 14:47:10 +00:00
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
2018-09-15 23:23:13 +00:00
|
|
|
j.logger.Warn("policy override set for job", "job", args.Job.ID)
|
2017-09-19 14:47:10 +00:00
|
|
|
}
|
2017-08-21 04:31:45 +00:00
|
|
|
}
|
|
|
|
|
2021-12-06 20:20:34 +00:00
|
|
|
if ok, err := registrationsAreAllowed(aclObj, j.srv.State()); !ok || err != nil {
|
|
|
|
j.logger.Warn("job registration is currently disabled for non-management ACL")
|
|
|
|
return structs.ErrJobRegistrationDisabled
|
|
|
|
}
|
|
|
|
|
2017-06-27 23:08:18 +00:00
|
|
|
// Lookup the job
|
2017-09-19 14:47:10 +00:00
|
|
|
snap, err := j.srv.State().Snapshot()
|
2017-06-27 23:08:18 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
ws := memdb.NewWatchSet()
|
2017-09-07 23:56:15 +00:00
|
|
|
existingJob, err := snap.JobByID(ws, args.RequestNamespace(), args.Job.ID)
|
2017-06-27 23:08:18 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
// If EnforceIndex set, check it before trying to apply
|
2016-06-08 23:48:02 +00:00
|
|
|
if args.EnforceIndex {
|
|
|
|
jmi := args.JobModifyIndex
|
2017-06-27 23:08:18 +00:00
|
|
|
if existingJob != nil {
|
2016-06-08 23:48:02 +00:00
|
|
|
if jmi == 0 {
|
|
|
|
return fmt.Errorf("%s 0: job already exists", RegisterEnforceIndexErrPrefix)
|
2017-06-27 23:08:18 +00:00
|
|
|
} else if jmi != existingJob.JobModifyIndex {
|
2016-06-08 23:48:02 +00:00
|
|
|
return fmt.Errorf("%s %d: job exists with conflicting job modify index: %d",
|
2017-06-27 23:08:18 +00:00
|
|
|
RegisterEnforceIndexErrPrefix, jmi, existingJob.JobModifyIndex)
|
2016-06-08 23:48:02 +00:00
|
|
|
}
|
|
|
|
} else if jmi != 0 {
|
|
|
|
return fmt.Errorf("%s %d: job does not exist", RegisterEnforceIndexErrPrefix, jmi)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-06-27 23:08:18 +00:00
|
|
|
// Validate job transitions if it's an update
|
2018-06-11 15:59:03 +00:00
|
|
|
if err := validateJobUpdate(existingJob, args.Job); err != nil {
|
|
|
|
return err
|
2017-06-27 23:08:18 +00:00
|
|
|
}
|
|
|
|
|
2020-02-21 21:23:30 +00:00
|
|
|
// Ensure that all scaling policies have an appropriate ID
|
|
|
|
if err := propagateScalingPolicyIDs(existingJob, args.Job); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2016-08-17 00:50:14 +00:00
|
|
|
// Ensure that the job has permissions for the requested Vault tokens
|
2016-09-01 21:23:40 +00:00
|
|
|
policies := args.Job.VaultPolicies()
|
|
|
|
if len(policies) != 0 {
|
2016-08-17 00:50:14 +00:00
|
|
|
vconf := j.srv.config.VaultConfig
|
2016-10-11 01:04:39 +00:00
|
|
|
if !vconf.IsEnabled() {
|
2016-08-17 00:50:14 +00:00
|
|
|
return fmt.Errorf("Vault not enabled and Vault policies requested")
|
|
|
|
}
|
|
|
|
|
|
|
|
// Have to check if the user has permissions
|
2016-10-11 01:04:39 +00:00
|
|
|
if !vconf.AllowsUnauthenticated() {
|
2016-08-17 00:50:14 +00:00
|
|
|
if args.Job.VaultToken == "" {
|
|
|
|
return fmt.Errorf("Vault policies requested but missing Vault Token")
|
|
|
|
}
|
|
|
|
|
|
|
|
vault := j.srv.vault
|
2016-08-18 20:52:15 +00:00
|
|
|
s, err := vault.LookupToken(context.Background(), args.Job.VaultToken)
|
2016-08-17 00:50:14 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
allowedPolicies, err := PoliciesFrom(s)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2020-07-17 14:41:45 +00:00
|
|
|
// Check Namespaces
|
|
|
|
namespaceErr := j.multiVaultNamespaceValidation(policies, s)
|
|
|
|
if namespaceErr != nil {
|
|
|
|
return namespaceErr
|
|
|
|
}
|
|
|
|
|
2016-09-01 19:05:08 +00:00
|
|
|
// If we are given a root token it can access all policies
|
|
|
|
if !lib.StrContains(allowedPolicies, "root") {
|
2016-09-01 21:23:40 +00:00
|
|
|
flatPolicies := structs.VaultPoliciesSet(policies)
|
2017-01-18 23:55:14 +00:00
|
|
|
subset, offending := helper.SliceStringIsSubset(allowedPolicies, flatPolicies)
|
2016-09-01 19:05:08 +00:00
|
|
|
if !subset {
|
|
|
|
return fmt.Errorf("Passed Vault Token doesn't allow access to the following policies: %s",
|
|
|
|
strings.Join(offending, ", "))
|
|
|
|
}
|
2016-08-17 00:50:14 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-03-16 18:22:21 +00:00
|
|
|
// helper function that checks if the Consul token supplied with the job has
|
|
|
|
// sufficient ACL permissions for:
|
|
|
|
// - registering services into namespace of each group
|
|
|
|
// - reading kv store of each group
|
|
|
|
// - establishing consul connect services
|
|
|
|
checkConsulToken := func(usages map[string]*structs.ConsulUsage) error {
|
2019-12-06 20:46:46 +00:00
|
|
|
if j.srv.config.ConsulConfig.AllowsUnauthenticated() {
|
|
|
|
// if consul.allow_unauthenticated is enabled (which is the default)
|
2021-03-16 18:22:21 +00:00
|
|
|
// just let the job through without checking anything
|
2019-12-06 20:46:46 +00:00
|
|
|
return nil
|
|
|
|
}
|
2020-08-27 16:53:41 +00:00
|
|
|
|
2019-12-06 20:46:46 +00:00
|
|
|
ctx := context.Background()
|
2021-03-16 18:22:21 +00:00
|
|
|
for namespace, usage := range usages {
|
|
|
|
if err := j.srv.consulACLs.CheckPermissions(ctx, namespace, usage, args.Job.ConsulToken); err != nil {
|
|
|
|
return errors.Wrap(err, "job-submitter consul token denied")
|
|
|
|
}
|
2019-12-06 20:46:46 +00:00
|
|
|
}
|
2021-03-16 18:22:21 +00:00
|
|
|
|
2019-12-06 20:46:46 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2021-03-16 18:22:21 +00:00
|
|
|
// Enforce the job-submitter has a Consul token with necessary ACL permissions.
|
|
|
|
if err := checkConsulToken(args.Job.ConsulUsages()); err != nil {
|
|
|
|
return err
|
2019-12-06 20:46:46 +00:00
|
|
|
}
|
|
|
|
|
2020-07-28 20:12:08 +00:00
|
|
|
// Create or Update Consul Configuration Entries defined in the job. For now
|
2021-04-19 17:29:36 +00:00
|
|
|
// Nomad only supports Configuration Entries types
|
|
|
|
// - "ingress-gateway" for managing Ingress Gateways
|
|
|
|
// - "terminating-gateway" for managing Terminating Gateways
|
2020-07-28 20:12:08 +00:00
|
|
|
//
|
|
|
|
// This is done as a blocking operation that prevents the job from being
|
|
|
|
// submitted if the configuration entries cannot be set in Consul.
|
|
|
|
//
|
|
|
|
// Every job update will re-write the Configuration Entry into Consul.
|
2020-08-26 19:23:31 +00:00
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
|
|
|
defer cancel()
|
2021-04-19 17:29:36 +00:00
|
|
|
|
|
|
|
for ns, entries := range args.Job.ConfigEntries() {
|
|
|
|
for service, entry := range entries.Ingress {
|
|
|
|
if errCE := j.srv.consulConfigEntries.SetIngressCE(ctx, ns, service, entry); errCE != nil {
|
|
|
|
return errCE
|
|
|
|
}
|
2020-07-28 20:12:08 +00:00
|
|
|
}
|
2021-04-19 17:29:36 +00:00
|
|
|
for service, entry := range entries.Terminating {
|
|
|
|
if errCE := j.srv.consulConfigEntries.SetTerminatingCE(ctx, ns, service, entry); errCE != nil {
|
|
|
|
return errCE
|
|
|
|
}
|
2020-12-15 20:38:33 +00:00
|
|
|
}
|
|
|
|
}
|
2020-07-28 20:12:08 +00:00
|
|
|
|
2019-06-04 15:48:49 +00:00
|
|
|
// Enforce Sentinel policies. Pass a copy of the job to prevent
|
|
|
|
// sentinel from altering it.
|
|
|
|
policyWarnings, err := j.enforceSubmitJob(args.PolicyOverride, args.Job.Copy())
|
2017-09-19 14:47:10 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if policyWarnings != nil {
|
2019-08-15 15:22:37 +00:00
|
|
|
warnings = append(warnings, policyWarnings)
|
|
|
|
reply.Warnings = structs.MergeMultierrorWarnings(warnings...)
|
2017-09-19 14:47:10 +00:00
|
|
|
}
|
|
|
|
|
2016-08-17 00:50:14 +00:00
|
|
|
// Clear the Vault token
|
|
|
|
args.Job.VaultToken = ""
|
|
|
|
|
2020-02-12 15:58:51 +00:00
|
|
|
// Clear the Consul token
|
|
|
|
args.Job.ConsulToken = ""
|
|
|
|
|
2020-06-16 17:52:47 +00:00
|
|
|
// Preserve the existing task group counts, if so requested
|
|
|
|
if existingJob != nil && args.PreserveCounts {
|
|
|
|
prevCounts := make(map[string]int)
|
|
|
|
for _, tg := range existingJob.TaskGroups {
|
|
|
|
prevCounts[tg.Name] = tg.Count
|
|
|
|
}
|
|
|
|
for _, tg := range args.Job.TaskGroups {
|
|
|
|
if count, ok := prevCounts[tg.Name]; ok {
|
|
|
|
tg.Count = count
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-06-15 20:18:14 +00:00
|
|
|
// Submit a multiregion job to other regions (enterprise only).
|
|
|
|
// The job will have its region interpolated.
|
2020-10-12 17:59:48 +00:00
|
|
|
var newVersion uint64
|
2020-07-03 14:44:41 +00:00
|
|
|
if existingJob != nil {
|
2020-10-12 17:59:48 +00:00
|
|
|
newVersion = existingJob.Version + 1
|
2020-07-03 14:44:41 +00:00
|
|
|
}
|
2020-10-12 17:59:48 +00:00
|
|
|
isRunner, err := j.multiregionRegister(args, reply, newVersion)
|
2020-06-15 20:18:14 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
Atomic eval insertion with job (de-)registration
This fixes a bug where jobs may get "stuck" unprocessed that
disproportionately affects periodic jobs around leadership transitions.
When registering a job, the job registration and the eval to process it
get applied to raft as two separate transactions; if the job
registration succeeds but eval application fails, the job may remain
unprocessed. Operators may detect such failure, when submitting a job
update and get a 500 error code, and they could retry; periodic jobs
failures are more likely to go unnoticed, and no further periodic
invocations will be processed until an operator force evaluation.
This fixes the issue by ensuring that the job registration and eval
application get persisted and processed atomically in the same raft log
entry.
Also, applies the same change to ensure atomicity in job deregistration.
Backward Compatibility
We must maintain compatibility in two scenarios: mixed clusters where a
leader can handle atomic updates but followers cannot, and a recent
cluster processes old log entries from legacy or mixed cluster mode.
To handle these constraints: ensure that the leader continues to emit the
Evaluation log entry until all servers have upgraded; also, when
processing raft logs, the servers honor evaluations found in both spots,
the Eval in job (de-)registration and the eval update entries.
When an updated server sees mix-mode behavior where an eval is inserted
into the raft log twice, it ignores the second instance.
I made one compromise in consistency in the mixed-mode scenario: servers
may disagree on the eval.CreateIndex value: the leader and updated
servers will report the job registration index while old servers will
report the index of the eval update log entry. This discrepancy doesn't
seem to be material - it's the eval.JobModifyIndex that matters.
2020-07-10 17:31:55 +00:00
|
|
|
// Create a new evaluation
|
2020-07-15 12:49:17 +00:00
|
|
|
now := time.Now().UnixNano()
|
Atomic eval insertion with job (de-)registration
This fixes a bug where jobs may get "stuck" unprocessed that
disproportionately affects periodic jobs around leadership transitions.
When registering a job, the job registration and the eval to process it
get applied to raft as two separate transactions; if the job
registration succeeds but eval application fails, the job may remain
unprocessed. Operators may detect such failure, when submitting a job
update and get a 500 error code, and they could retry; periodic jobs
failures are more likely to go unnoticed, and no further periodic
invocations will be processed until an operator force evaluation.
This fixes the issue by ensuring that the job registration and eval
application get persisted and processed atomically in the same raft log
entry.
Also, applies the same change to ensure atomicity in job deregistration.
Backward Compatibility
We must maintain compatibility in two scenarios: mixed clusters where a
leader can handle atomic updates but followers cannot, and a recent
cluster processes old log entries from legacy or mixed cluster mode.
To handle these constraints: ensure that the leader continues to emit the
Evaluation log entry until all servers have upgraded; also, when
processing raft logs, the servers honor evaluations found in both spots,
the Eval in job (de-)registration and the eval update entries.
When an updated server sees mix-mode behavior where an eval is inserted
into the raft log twice, it ignores the second instance.
I made one compromise in consistency in the mixed-mode scenario: servers
may disagree on the eval.CreateIndex value: the leader and updated
servers will report the job registration index while old servers will
report the index of the eval update log entry. This discrepancy doesn't
seem to be material - it's the eval.JobModifyIndex that matters.
2020-07-10 17:31:55 +00:00
|
|
|
submittedEval := false
|
2020-07-15 15:10:57 +00:00
|
|
|
var eval *structs.Evaluation
|
Atomic eval insertion with job (de-)registration
This fixes a bug where jobs may get "stuck" unprocessed that
disproportionately affects periodic jobs around leadership transitions.
When registering a job, the job registration and the eval to process it
get applied to raft as two separate transactions; if the job
registration succeeds but eval application fails, the job may remain
unprocessed. Operators may detect such failure, when submitting a job
update and get a 500 error code, and they could retry; periodic jobs
failures are more likely to go unnoticed, and no further periodic
invocations will be processed until an operator force evaluation.
This fixes the issue by ensuring that the job registration and eval
application get persisted and processed atomically in the same raft log
entry.
Also, applies the same change to ensure atomicity in job deregistration.
Backward Compatibility
We must maintain compatibility in two scenarios: mixed clusters where a
leader can handle atomic updates but followers cannot, and a recent
cluster processes old log entries from legacy or mixed cluster mode.
To handle these constraints: ensure that the leader continues to emit the
Evaluation log entry until all servers have upgraded; also, when
processing raft logs, the servers honor evaluations found in both spots,
the Eval in job (de-)registration and the eval update entries.
When an updated server sees mix-mode behavior where an eval is inserted
into the raft log twice, it ignores the second instance.
I made one compromise in consistency in the mixed-mode scenario: servers
may disagree on the eval.CreateIndex value: the leader and updated
servers will report the job registration index while old servers will
report the index of the eval update log entry. This discrepancy doesn't
seem to be material - it's the eval.JobModifyIndex that matters.
2020-07-10 17:31:55 +00:00
|
|
|
|
|
|
|
// Set the submit time
|
2020-07-15 12:49:17 +00:00
|
|
|
args.Job.SubmitTime = now
|
Atomic eval insertion with job (de-)registration
This fixes a bug where jobs may get "stuck" unprocessed that
disproportionately affects periodic jobs around leadership transitions.
When registering a job, the job registration and the eval to process it
get applied to raft as two separate transactions; if the job
registration succeeds but eval application fails, the job may remain
unprocessed. Operators may detect such failure, when submitting a job
update and get a 500 error code, and they could retry; periodic jobs
failures are more likely to go unnoticed, and no further periodic
invocations will be processed until an operator force evaluation.
This fixes the issue by ensuring that the job registration and eval
application get persisted and processed atomically in the same raft log
entry.
Also, applies the same change to ensure atomicity in job deregistration.
Backward Compatibility
We must maintain compatibility in two scenarios: mixed clusters where a
leader can handle atomic updates but followers cannot, and a recent
cluster processes old log entries from legacy or mixed cluster mode.
To handle these constraints: ensure that the leader continues to emit the
Evaluation log entry until all servers have upgraded; also, when
processing raft logs, the servers honor evaluations found in both spots,
the Eval in job (de-)registration and the eval update entries.
When an updated server sees mix-mode behavior where an eval is inserted
into the raft log twice, it ignores the second instance.
I made one compromise in consistency in the mixed-mode scenario: servers
may disagree on the eval.CreateIndex value: the leader and updated
servers will report the job registration index while old servers will
report the index of the eval update log entry. This discrepancy doesn't
seem to be material - it's the eval.JobModifyIndex that matters.
2020-07-10 17:31:55 +00:00
|
|
|
|
|
|
|
// If the job is periodic or parameterized, we don't create an eval.
|
|
|
|
if !(args.Job.IsPeriodic() || args.Job.IsParameterized()) {
|
2021-11-23 08:23:31 +00:00
|
|
|
|
|
|
|
// Initially set the eval priority to that of the job priority. If the
|
|
|
|
// user supplied an eval priority override, we subsequently use this.
|
|
|
|
evalPriority := args.Job.Priority
|
|
|
|
if args.EvalPriority > 0 {
|
|
|
|
evalPriority = args.EvalPriority
|
|
|
|
}
|
|
|
|
|
2020-07-15 15:10:57 +00:00
|
|
|
eval = &structs.Evaluation{
|
Atomic eval insertion with job (de-)registration
This fixes a bug where jobs may get "stuck" unprocessed that
disproportionately affects periodic jobs around leadership transitions.
When registering a job, the job registration and the eval to process it
get applied to raft as two separate transactions; if the job
registration succeeds but eval application fails, the job may remain
unprocessed. Operators may detect such failure, when submitting a job
update and get a 500 error code, and they could retry; periodic jobs
failures are more likely to go unnoticed, and no further periodic
invocations will be processed until an operator force evaluation.
This fixes the issue by ensuring that the job registration and eval
application get persisted and processed atomically in the same raft log
entry.
Also, applies the same change to ensure atomicity in job deregistration.
Backward Compatibility
We must maintain compatibility in two scenarios: mixed clusters where a
leader can handle atomic updates but followers cannot, and a recent
cluster processes old log entries from legacy or mixed cluster mode.
To handle these constraints: ensure that the leader continues to emit the
Evaluation log entry until all servers have upgraded; also, when
processing raft logs, the servers honor evaluations found in both spots,
the Eval in job (de-)registration and the eval update entries.
When an updated server sees mix-mode behavior where an eval is inserted
into the raft log twice, it ignores the second instance.
I made one compromise in consistency in the mixed-mode scenario: servers
may disagree on the eval.CreateIndex value: the leader and updated
servers will report the job registration index while old servers will
report the index of the eval update log entry. This discrepancy doesn't
seem to be material - it's the eval.JobModifyIndex that matters.
2020-07-10 17:31:55 +00:00
|
|
|
ID: uuid.Generate(),
|
|
|
|
Namespace: args.RequestNamespace(),
|
2021-11-23 08:23:31 +00:00
|
|
|
Priority: evalPriority,
|
Atomic eval insertion with job (de-)registration
This fixes a bug where jobs may get "stuck" unprocessed that
disproportionately affects periodic jobs around leadership transitions.
When registering a job, the job registration and the eval to process it
get applied to raft as two separate transactions; if the job
registration succeeds but eval application fails, the job may remain
unprocessed. Operators may detect such failure, when submitting a job
update and get a 500 error code, and they could retry; periodic jobs
failures are more likely to go unnoticed, and no further periodic
invocations will be processed until an operator force evaluation.
This fixes the issue by ensuring that the job registration and eval
application get persisted and processed atomically in the same raft log
entry.
Also, applies the same change to ensure atomicity in job deregistration.
Backward Compatibility
We must maintain compatibility in two scenarios: mixed clusters where a
leader can handle atomic updates but followers cannot, and a recent
cluster processes old log entries from legacy or mixed cluster mode.
To handle these constraints: ensure that the leader continues to emit the
Evaluation log entry until all servers have upgraded; also, when
processing raft logs, the servers honor evaluations found in both spots,
the Eval in job (de-)registration and the eval update entries.
When an updated server sees mix-mode behavior where an eval is inserted
into the raft log twice, it ignores the second instance.
I made one compromise in consistency in the mixed-mode scenario: servers
may disagree on the eval.CreateIndex value: the leader and updated
servers will report the job registration index while old servers will
report the index of the eval update log entry. This discrepancy doesn't
seem to be material - it's the eval.JobModifyIndex that matters.
2020-07-10 17:31:55 +00:00
|
|
|
Type: args.Job.Type,
|
|
|
|
TriggeredBy: structs.EvalTriggerJobRegister,
|
|
|
|
JobID: args.Job.ID,
|
|
|
|
Status: structs.EvalStatusPending,
|
|
|
|
CreateTime: now,
|
|
|
|
ModifyTime: now,
|
|
|
|
}
|
2020-07-15 15:10:57 +00:00
|
|
|
reply.EvalID = eval.ID
|
Atomic eval insertion with job (de-)registration
This fixes a bug where jobs may get "stuck" unprocessed that
disproportionately affects periodic jobs around leadership transitions.
When registering a job, the job registration and the eval to process it
get applied to raft as two separate transactions; if the job
registration succeeds but eval application fails, the job may remain
unprocessed. Operators may detect such failure, when submitting a job
update and get a 500 error code, and they could retry; periodic jobs
failures are more likely to go unnoticed, and no further periodic
invocations will be processed until an operator force evaluation.
This fixes the issue by ensuring that the job registration and eval
application get persisted and processed atomically in the same raft log
entry.
Also, applies the same change to ensure atomicity in job deregistration.
Backward Compatibility
We must maintain compatibility in two scenarios: mixed clusters where a
leader can handle atomic updates but followers cannot, and a recent
cluster processes old log entries from legacy or mixed cluster mode.
To handle these constraints: ensure that the leader continues to emit the
Evaluation log entry until all servers have upgraded; also, when
processing raft logs, the servers honor evaluations found in both spots,
the Eval in job (de-)registration and the eval update entries.
When an updated server sees mix-mode behavior where an eval is inserted
into the raft log twice, it ignores the second instance.
I made one compromise in consistency in the mixed-mode scenario: servers
may disagree on the eval.CreateIndex value: the leader and updated
servers will report the job registration index while old servers will
report the index of the eval update log entry. This discrepancy doesn't
seem to be material - it's the eval.JobModifyIndex that matters.
2020-07-10 17:31:55 +00:00
|
|
|
}
|
|
|
|
|
2017-05-23 00:02:20 +00:00
|
|
|
// Check if the job has changed at all
|
2017-07-01 00:23:34 +00:00
|
|
|
if existingJob == nil || existingJob.SpecChanged(args.Job) {
|
2017-06-30 02:08:25 +00:00
|
|
|
|
2020-07-15 15:10:57 +00:00
|
|
|
// COMPAT(1.1.0): Remove the ServerMeetMinimumVersion check to always set args.Eval
|
|
|
|
// 0.12.1 introduced atomic eval job registration
|
|
|
|
if eval != nil && ServersMeetMinimumVersion(j.srv.Members(), minJobRegisterAtomicEvalVersion, false) {
|
|
|
|
args.Eval = eval
|
|
|
|
submittedEval = true
|
|
|
|
}
|
|
|
|
|
2017-05-23 00:02:20 +00:00
|
|
|
// Commit this update via Raft
|
2017-09-19 14:47:10 +00:00
|
|
|
fsmErr, index, err := j.srv.raftApply(structs.JobRegisterRequestType, args)
|
|
|
|
if err, ok := fsmErr.(error); ok && err != nil {
|
2018-09-15 23:23:13 +00:00
|
|
|
j.logger.Error("registering job failed", "error", err, "fsm", true)
|
2017-09-19 14:47:10 +00:00
|
|
|
return err
|
|
|
|
}
|
2017-05-23 00:02:20 +00:00
|
|
|
if err != nil {
|
2018-09-15 23:23:13 +00:00
|
|
|
j.logger.Error("registering job failed", "error", err, "raft", true)
|
2017-05-23 00:02:20 +00:00
|
|
|
return err
|
|
|
|
}
|
2015-07-23 21:41:18 +00:00
|
|
|
|
2017-05-23 00:02:20 +00:00
|
|
|
// Populate the reply with job information
|
|
|
|
reply.JobModifyIndex = index
|
Atomic eval insertion with job (de-)registration
This fixes a bug where jobs may get "stuck" unprocessed that
disproportionately affects periodic jobs around leadership transitions.
When registering a job, the job registration and the eval to process it
get applied to raft as two separate transactions; if the job
registration succeeds but eval application fails, the job may remain
unprocessed. Operators may detect such failure, when submitting a job
update and get a 500 error code, and they could retry; periodic jobs
failures are more likely to go unnoticed, and no further periodic
invocations will be processed until an operator force evaluation.
This fixes the issue by ensuring that the job registration and eval
application get persisted and processed atomically in the same raft log
entry.
Also, applies the same change to ensure atomicity in job deregistration.
Backward Compatibility
We must maintain compatibility in two scenarios: mixed clusters where a
leader can handle atomic updates but followers cannot, and a recent
cluster processes old log entries from legacy or mixed cluster mode.
To handle these constraints: ensure that the leader continues to emit the
Evaluation log entry until all servers have upgraded; also, when
processing raft logs, the servers honor evaluations found in both spots,
the Eval in job (de-)registration and the eval update entries.
When an updated server sees mix-mode behavior where an eval is inserted
into the raft log twice, it ignores the second instance.
I made one compromise in consistency in the mixed-mode scenario: servers
may disagree on the eval.CreateIndex value: the leader and updated
servers will report the job registration index while old servers will
report the index of the eval update log entry. This discrepancy doesn't
seem to be material - it's the eval.JobModifyIndex that matters.
2020-07-10 17:31:55 +00:00
|
|
|
reply.Index = index
|
|
|
|
|
2020-07-15 15:10:57 +00:00
|
|
|
if submittedEval {
|
Atomic eval insertion with job (de-)registration
This fixes a bug where jobs may get "stuck" unprocessed that
disproportionately affects periodic jobs around leadership transitions.
When registering a job, the job registration and the eval to process it
get applied to raft as two separate transactions; if the job
registration succeeds but eval application fails, the job may remain
unprocessed. Operators may detect such failure, when submitting a job
update and get a 500 error code, and they could retry; periodic jobs
failures are more likely to go unnoticed, and no further periodic
invocations will be processed until an operator force evaluation.
This fixes the issue by ensuring that the job registration and eval
application get persisted and processed atomically in the same raft log
entry.
Also, applies the same change to ensure atomicity in job deregistration.
Backward Compatibility
We must maintain compatibility in two scenarios: mixed clusters where a
leader can handle atomic updates but followers cannot, and a recent
cluster processes old log entries from legacy or mixed cluster mode.
To handle these constraints: ensure that the leader continues to emit the
Evaluation log entry until all servers have upgraded; also, when
processing raft logs, the servers honor evaluations found in both spots,
the Eval in job (de-)registration and the eval update entries.
When an updated server sees mix-mode behavior where an eval is inserted
into the raft log twice, it ignores the second instance.
I made one compromise in consistency in the mixed-mode scenario: servers
may disagree on the eval.CreateIndex value: the leader and updated
servers will report the job registration index while old servers will
report the index of the eval update log entry. This discrepancy doesn't
seem to be material - it's the eval.JobModifyIndex that matters.
2020-07-10 17:31:55 +00:00
|
|
|
reply.EvalCreateIndex = index
|
|
|
|
}
|
|
|
|
|
2017-05-23 00:02:20 +00:00
|
|
|
} else {
|
2017-07-01 00:23:34 +00:00
|
|
|
reply.JobModifyIndex = existingJob.JobModifyIndex
|
2017-05-23 00:02:20 +00:00
|
|
|
}
|
2015-12-01 19:40:40 +00:00
|
|
|
|
2020-07-14 14:57:37 +00:00
|
|
|
// used for multiregion start
|
|
|
|
args.Job.JobModifyIndex = reply.JobModifyIndex
|
|
|
|
|
2020-07-15 15:10:57 +00:00
|
|
|
if eval == nil {
|
2020-08-27 18:54:45 +00:00
|
|
|
// For dispatch jobs we return early, so we need to drop regions
|
|
|
|
// here rather than after eval for deployments is kicked off
|
|
|
|
err = j.multiregionDrop(args, reply)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2015-12-01 19:40:40 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2020-07-15 15:10:57 +00:00
|
|
|
if eval != nil && !submittedEval {
|
|
|
|
eval.JobModifyIndex = reply.JobModifyIndex
|
Atomic eval insertion with job (de-)registration
This fixes a bug where jobs may get "stuck" unprocessed that
disproportionately affects periodic jobs around leadership transitions.
When registering a job, the job registration and the eval to process it
get applied to raft as two separate transactions; if the job
registration succeeds but eval application fails, the job may remain
unprocessed. Operators may detect such failure, when submitting a job
update and get a 500 error code, and they could retry; periodic jobs
failures are more likely to go unnoticed, and no further periodic
invocations will be processed until an operator force evaluation.
This fixes the issue by ensuring that the job registration and eval
application get persisted and processed atomically in the same raft log
entry.
Also, applies the same change to ensure atomicity in job deregistration.
Backward Compatibility
We must maintain compatibility in two scenarios: mixed clusters where a
leader can handle atomic updates but followers cannot, and a recent
cluster processes old log entries from legacy or mixed cluster mode.
To handle these constraints: ensure that the leader continues to emit the
Evaluation log entry until all servers have upgraded; also, when
processing raft logs, the servers honor evaluations found in both spots,
the Eval in job (de-)registration and the eval update entries.
When an updated server sees mix-mode behavior where an eval is inserted
into the raft log twice, it ignores the second instance.
I made one compromise in consistency in the mixed-mode scenario: servers
may disagree on the eval.CreateIndex value: the leader and updated
servers will report the job registration index while old servers will
report the index of the eval update log entry. This discrepancy doesn't
seem to be material - it's the eval.JobModifyIndex that matters.
2020-07-10 17:31:55 +00:00
|
|
|
update := &structs.EvalUpdateRequest{
|
2020-07-15 15:10:57 +00:00
|
|
|
Evals: []*structs.Evaluation{eval},
|
Atomic eval insertion with job (de-)registration
This fixes a bug where jobs may get "stuck" unprocessed that
disproportionately affects periodic jobs around leadership transitions.
When registering a job, the job registration and the eval to process it
get applied to raft as two separate transactions; if the job
registration succeeds but eval application fails, the job may remain
unprocessed. Operators may detect such failure, when submitting a job
update and get a 500 error code, and they could retry; periodic jobs
failures are more likely to go unnoticed, and no further periodic
invocations will be processed until an operator force evaluation.
This fixes the issue by ensuring that the job registration and eval
application get persisted and processed atomically in the same raft log
entry.
Also, applies the same change to ensure atomicity in job deregistration.
Backward Compatibility
We must maintain compatibility in two scenarios: mixed clusters where a
leader can handle atomic updates but followers cannot, and a recent
cluster processes old log entries from legacy or mixed cluster mode.
To handle these constraints: ensure that the leader continues to emit the
Evaluation log entry until all servers have upgraded; also, when
processing raft logs, the servers honor evaluations found in both spots,
the Eval in job (de-)registration and the eval update entries.
When an updated server sees mix-mode behavior where an eval is inserted
into the raft log twice, it ignores the second instance.
I made one compromise in consistency in the mixed-mode scenario: servers
may disagree on the eval.CreateIndex value: the leader and updated
servers will report the job registration index while old servers will
report the index of the eval update log entry. This discrepancy doesn't
seem to be material - it's the eval.JobModifyIndex that matters.
2020-07-10 17:31:55 +00:00
|
|
|
WriteRequest: structs.WriteRequest{Region: args.Region},
|
|
|
|
}
|
2015-08-06 18:48:44 +00:00
|
|
|
|
Atomic eval insertion with job (de-)registration
This fixes a bug where jobs may get "stuck" unprocessed that
disproportionately affects periodic jobs around leadership transitions.
When registering a job, the job registration and the eval to process it
get applied to raft as two separate transactions; if the job
registration succeeds but eval application fails, the job may remain
unprocessed. Operators may detect such failure, when submitting a job
update and get a 500 error code, and they could retry; periodic jobs
failures are more likely to go unnoticed, and no further periodic
invocations will be processed until an operator force evaluation.
This fixes the issue by ensuring that the job registration and eval
application get persisted and processed atomically in the same raft log
entry.
Also, applies the same change to ensure atomicity in job deregistration.
Backward Compatibility
We must maintain compatibility in two scenarios: mixed clusters where a
leader can handle atomic updates but followers cannot, and a recent
cluster processes old log entries from legacy or mixed cluster mode.
To handle these constraints: ensure that the leader continues to emit the
Evaluation log entry until all servers have upgraded; also, when
processing raft logs, the servers honor evaluations found in both spots,
the Eval in job (de-)registration and the eval update entries.
When an updated server sees mix-mode behavior where an eval is inserted
into the raft log twice, it ignores the second instance.
I made one compromise in consistency in the mixed-mode scenario: servers
may disagree on the eval.CreateIndex value: the leader and updated
servers will report the job registration index while old servers will
report the index of the eval update log entry. This discrepancy doesn't
seem to be material - it's the eval.JobModifyIndex that matters.
2020-07-10 17:31:55 +00:00
|
|
|
// Commit this evaluation via Raft
|
|
|
|
// There is a risk of partial failure where the JobRegister succeeds
|
|
|
|
// but that the EvalUpdate does not, before 0.12.1
|
|
|
|
_, evalIndex, err := j.srv.raftApply(structs.EvalUpdateRequestType, update)
|
|
|
|
if err != nil {
|
|
|
|
j.logger.Error("eval create failed", "error", err, "method", "register")
|
|
|
|
return err
|
|
|
|
}
|
2015-08-06 18:48:44 +00:00
|
|
|
|
2020-07-15 15:10:57 +00:00
|
|
|
reply.EvalCreateIndex = evalIndex
|
|
|
|
reply.Index = evalIndex
|
Atomic eval insertion with job (de-)registration
This fixes a bug where jobs may get "stuck" unprocessed that
disproportionately affects periodic jobs around leadership transitions.
When registering a job, the job registration and the eval to process it
get applied to raft as two separate transactions; if the job
registration succeeds but eval application fails, the job may remain
unprocessed. Operators may detect such failure, when submitting a job
update and get a 500 error code, and they could retry; periodic jobs
failures are more likely to go unnoticed, and no further periodic
invocations will be processed until an operator force evaluation.
This fixes the issue by ensuring that the job registration and eval
application get persisted and processed atomically in the same raft log
entry.
Also, applies the same change to ensure atomicity in job deregistration.
Backward Compatibility
We must maintain compatibility in two scenarios: mixed clusters where a
leader can handle atomic updates but followers cannot, and a recent
cluster processes old log entries from legacy or mixed cluster mode.
To handle these constraints: ensure that the leader continues to emit the
Evaluation log entry until all servers have upgraded; also, when
processing raft logs, the servers honor evaluations found in both spots,
the Eval in job (de-)registration and the eval update entries.
When an updated server sees mix-mode behavior where an eval is inserted
into the raft log twice, it ignores the second instance.
I made one compromise in consistency in the mixed-mode scenario: servers
may disagree on the eval.CreateIndex value: the leader and updated
servers will report the job registration index while old servers will
report the index of the eval update log entry. This discrepancy doesn't
seem to be material - it's the eval.JobModifyIndex that matters.
2020-07-10 17:31:55 +00:00
|
|
|
}
|
2020-07-14 14:57:37 +00:00
|
|
|
|
|
|
|
// Kick off a multiregion deployment (enterprise only).
|
|
|
|
if isRunner {
|
|
|
|
err = j.multiregionStart(args, reply)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2020-08-27 18:54:45 +00:00
|
|
|
// We drop any unwanted regions only once we know all jobs have
|
|
|
|
// been registered and we've kicked off the deployment. This keeps
|
|
|
|
// dropping regions close in semantics to dropping task groups in
|
|
|
|
// single-region deployments
|
|
|
|
err = j.multiregionDrop(args, reply)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2020-07-14 14:57:37 +00:00
|
|
|
}
|
|
|
|
|
2015-07-23 21:41:18 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2020-02-21 21:23:30 +00:00
|
|
|
// propagateScalingPolicyIDs propagates scaling policy IDs from existing job
|
|
|
|
// to updated job, or generates random IDs in new job
|
|
|
|
func propagateScalingPolicyIDs(old, new *structs.Job) error {
|
|
|
|
|
|
|
|
oldIDs := make(map[string]string)
|
|
|
|
if old != nil {
|
2020-09-09 22:30:40 +00:00
|
|
|
// use the job-scoped key (includes type, group, and task) to uniquely
|
|
|
|
// identify policies in a job
|
2020-02-21 21:23:30 +00:00
|
|
|
for _, p := range old.GetScalingPolicies() {
|
2020-09-09 22:30:40 +00:00
|
|
|
oldIDs[p.JobKey()] = p.ID
|
2020-02-21 21:23:30 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// ignore any existing ID in the policy, they should be empty
|
|
|
|
for _, p := range new.GetScalingPolicies() {
|
2020-09-09 22:30:40 +00:00
|
|
|
if id, ok := oldIDs[p.JobKey()]; ok {
|
2020-02-21 21:23:30 +00:00
|
|
|
p.ID = id
|
|
|
|
} else {
|
|
|
|
p.ID = uuid.Generate()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2016-10-20 20:55:35 +00:00
|
|
|
// getSignalConstraint builds a suitable constraint based on the required
|
|
|
|
// signals
|
|
|
|
func getSignalConstraint(signals []string) *structs.Constraint {
|
2018-04-25 20:49:58 +00:00
|
|
|
sort.Strings(signals)
|
2016-10-20 20:55:35 +00:00
|
|
|
return &structs.Constraint{
|
|
|
|
Operand: structs.ConstraintSetContains,
|
|
|
|
LTarget: "${attr.os.signals}",
|
|
|
|
RTarget: strings.Join(signals, ","),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-12-13 17:36:03 +00:00
|
|
|
// Summary retrieves the summary of a job
|
2016-07-25 21:33:39 +00:00
|
|
|
func (j *Job) Summary(args *structs.JobSummaryRequest,
|
2016-07-21 21:43:21 +00:00
|
|
|
reply *structs.JobSummaryResponse) error {
|
2017-09-12 15:56:55 +00:00
|
|
|
|
2016-07-25 21:33:39 +00:00
|
|
|
if done, err := j.srv.forward("Job.Summary", args, args, reply); done {
|
2016-07-21 20:04:38 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer metrics.MeasureSince([]string{"nomad", "job_summary", "get_job_summary"}, time.Now())
|
2017-09-14 14:52:50 +00:00
|
|
|
|
|
|
|
// Check for read-job permissions
|
2017-10-12 22:16:33 +00:00
|
|
|
if aclObj, err := j.srv.ResolveToken(args.AuthToken); err != nil {
|
2017-09-14 14:52:50 +00:00
|
|
|
return err
|
|
|
|
} else if aclObj != nil && !aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilityReadJob) {
|
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
|
|
|
|
2016-07-21 20:04:38 +00:00
|
|
|
// Setup the blocking query
|
|
|
|
opts := blockingOptions{
|
|
|
|
queryOpts: &args.QueryOptions,
|
|
|
|
queryMeta: &reply.QueryMeta,
|
2017-02-08 04:31:23 +00:00
|
|
|
run: func(ws memdb.WatchSet, state *state.StateStore) error {
|
2016-07-21 21:43:21 +00:00
|
|
|
// Look for job summary
|
2017-09-07 23:56:15 +00:00
|
|
|
out, err := state.JobSummaryByID(ws, args.RequestNamespace(), args.JobID)
|
2016-07-21 20:04:38 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Setup the output
|
|
|
|
reply.JobSummary = out
|
|
|
|
if out != nil {
|
|
|
|
reply.Index = out.ModifyIndex
|
|
|
|
} else {
|
2016-07-22 06:13:07 +00:00
|
|
|
// Use the last index that affected the job_summary table
|
2017-02-08 04:31:23 +00:00
|
|
|
index, err := state.Index("job_summary")
|
2016-07-21 20:04:38 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
reply.Index = index
|
|
|
|
}
|
|
|
|
|
|
|
|
// Set the query response
|
|
|
|
j.srv.setQueryMeta(&reply.QueryMeta)
|
|
|
|
return nil
|
|
|
|
}}
|
|
|
|
return j.srv.blockingRPC(&opts)
|
|
|
|
}
|
|
|
|
|
2017-02-06 19:48:28 +00:00
|
|
|
// Validate validates a job
|
2017-04-18 20:09:24 +00:00
|
|
|
func (j *Job) Validate(args *structs.JobValidateRequest, reply *structs.JobValidateResponse) error {
|
|
|
|
defer metrics.MeasureSince([]string{"nomad", "job", "validate"}, time.Now())
|
2017-02-06 19:48:28 +00:00
|
|
|
|
2019-09-26 20:44:04 +00:00
|
|
|
// defensive check; http layer and RPC requester should ensure namespaces are set consistently
|
2019-09-27 11:40:58 +00:00
|
|
|
if args.RequestNamespace() != args.Job.Namespace {
|
|
|
|
return fmt.Errorf("mismatched request namespace in request: %q, %q", args.RequestNamespace(), args.Job.Namespace)
|
2019-09-26 20:44:04 +00:00
|
|
|
}
|
|
|
|
|
2019-08-15 15:22:37 +00:00
|
|
|
job, mutateWarnings, err := j.admissionMutators(args.Job)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
args.Job = job
|
|
|
|
|
2017-09-25 17:30:31 +00:00
|
|
|
// Check for read-job permissions
|
2017-10-12 22:16:33 +00:00
|
|
|
if aclObj, err := j.srv.ResolveToken(args.AuthToken); err != nil {
|
2017-09-25 17:30:31 +00:00
|
|
|
return err
|
|
|
|
} else if aclObj != nil && !aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilityReadJob) {
|
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
|
|
|
|
2017-07-07 02:08:51 +00:00
|
|
|
// Validate the job and capture any warnings
|
2019-08-15 15:22:37 +00:00
|
|
|
validateWarnings, err := j.admissionValidators(args.Job)
|
2017-05-10 03:52:47 +00:00
|
|
|
if err != nil {
|
2017-02-06 19:48:28 +00:00
|
|
|
if merr, ok := err.(*multierror.Error); ok {
|
|
|
|
for _, err := range merr.Errors {
|
|
|
|
reply.ValidationErrors = append(reply.ValidationErrors, err.Error())
|
|
|
|
}
|
2017-03-03 23:00:39 +00:00
|
|
|
reply.Error = merr.Error()
|
2017-02-06 19:48:28 +00:00
|
|
|
} else {
|
|
|
|
reply.ValidationErrors = append(reply.ValidationErrors, err.Error())
|
2017-03-03 23:00:39 +00:00
|
|
|
reply.Error = err.Error()
|
2017-02-06 19:48:28 +00:00
|
|
|
}
|
|
|
|
}
|
2017-04-18 20:09:24 +00:00
|
|
|
|
2019-08-15 15:22:37 +00:00
|
|
|
validateWarnings = append(validateWarnings, mutateWarnings...)
|
|
|
|
|
2017-07-07 02:08:51 +00:00
|
|
|
// Set the warning message
|
2019-08-15 15:22:37 +00:00
|
|
|
reply.Warnings = structs.MergeMultierrorWarnings(validateWarnings...)
|
2017-02-06 19:48:28 +00:00
|
|
|
reply.DriverConfigValidated = true
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2017-04-18 22:11:33 +00:00
|
|
|
// Revert is used to revert the job to a prior version
|
|
|
|
func (j *Job) Revert(args *structs.JobRevertRequest, reply *structs.JobRegisterResponse) error {
|
|
|
|
if done, err := j.srv.forward("Job.Revert", args, args, reply); done {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer metrics.MeasureSince([]string{"nomad", "job", "revert"}, time.Now())
|
|
|
|
|
2017-09-25 21:36:22 +00:00
|
|
|
// Check for submit-job permissions
|
2017-10-12 22:16:33 +00:00
|
|
|
if aclObj, err := j.srv.ResolveToken(args.AuthToken); err != nil {
|
2017-09-25 21:36:22 +00:00
|
|
|
return err
|
|
|
|
} else if aclObj != nil && !aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilitySubmitJob) {
|
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
|
|
|
|
2017-04-18 22:11:33 +00:00
|
|
|
// Validate the arguments
|
|
|
|
if args.JobID == "" {
|
2017-07-06 19:49:13 +00:00
|
|
|
return fmt.Errorf("missing job ID for revert")
|
2017-04-18 22:11:33 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Lookup the job by version
|
|
|
|
snap, err := j.srv.fsm.State().Snapshot()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
ws := memdb.NewWatchSet()
|
2017-09-07 23:56:15 +00:00
|
|
|
cur, err := snap.JobByID(ws, args.RequestNamespace(), args.JobID)
|
2017-04-19 20:28:29 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if cur == nil {
|
|
|
|
return fmt.Errorf("job %q not found", args.JobID)
|
|
|
|
}
|
|
|
|
if args.JobVersion == cur.Version {
|
|
|
|
return fmt.Errorf("can't revert to current version")
|
|
|
|
}
|
|
|
|
|
2017-09-07 23:56:15 +00:00
|
|
|
jobV, err := snap.JobByIDAndVersion(ws, args.RequestNamespace(), args.JobID, args.JobVersion)
|
2017-04-27 17:51:28 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if jobV == nil {
|
2017-09-07 23:56:15 +00:00
|
|
|
return fmt.Errorf("job %q in namespace %q at version %d not found", args.JobID, args.RequestNamespace(), args.JobVersion)
|
2017-04-27 17:51:28 +00:00
|
|
|
}
|
|
|
|
|
2017-04-18 22:11:33 +00:00
|
|
|
// Build the register request
|
2019-03-25 23:12:39 +00:00
|
|
|
revJob := jobV.Copy()
|
2019-03-28 18:56:12 +00:00
|
|
|
// Use Vault Token from revert request to perform registration of reverted job.
|
2019-03-25 23:12:39 +00:00
|
|
|
revJob.VaultToken = args.VaultToken
|
2017-04-18 22:11:33 +00:00
|
|
|
reg := &structs.JobRegisterRequest{
|
2019-03-25 23:12:39 +00:00
|
|
|
Job: revJob,
|
2017-04-18 22:11:33 +00:00
|
|
|
WriteRequest: args.WriteRequest,
|
|
|
|
}
|
|
|
|
|
|
|
|
// If the request is enforcing the existing version do a check.
|
|
|
|
if args.EnforcePriorVersion != nil {
|
|
|
|
if cur.Version != *args.EnforcePriorVersion {
|
|
|
|
return fmt.Errorf("Current job has version %d; enforcing version %d", cur.Version, *args.EnforcePriorVersion)
|
|
|
|
}
|
|
|
|
|
|
|
|
reg.EnforceIndex = true
|
|
|
|
reg.JobModifyIndex = cur.JobModifyIndex
|
|
|
|
}
|
|
|
|
|
|
|
|
// Register the version.
|
|
|
|
return j.Register(reg, reply)
|
|
|
|
}
|
|
|
|
|
2017-07-06 19:49:13 +00:00
|
|
|
// Stable is used to mark the job version as stable
|
|
|
|
func (j *Job) Stable(args *structs.JobStabilityRequest, reply *structs.JobStabilityResponse) error {
|
|
|
|
if done, err := j.srv.forward("Job.Stable", args, args, reply); done {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer metrics.MeasureSince([]string{"nomad", "job", "stable"}, time.Now())
|
|
|
|
|
2017-09-25 22:17:58 +00:00
|
|
|
// Check for read-job permissions
|
2017-10-12 22:16:33 +00:00
|
|
|
if aclObj, err := j.srv.ResolveToken(args.AuthToken); err != nil {
|
2017-09-25 22:17:58 +00:00
|
|
|
return err
|
|
|
|
} else if aclObj != nil && !aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilitySubmitJob) {
|
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
|
|
|
|
2017-07-06 19:49:13 +00:00
|
|
|
// Validate the arguments
|
|
|
|
if args.JobID == "" {
|
|
|
|
return fmt.Errorf("missing job ID for marking job as stable")
|
|
|
|
}
|
|
|
|
|
|
|
|
// Lookup the job by version
|
|
|
|
snap, err := j.srv.fsm.State().Snapshot()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
ws := memdb.NewWatchSet()
|
2017-09-07 23:56:15 +00:00
|
|
|
jobV, err := snap.JobByIDAndVersion(ws, args.RequestNamespace(), args.JobID, args.JobVersion)
|
2017-07-06 19:49:13 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if jobV == nil {
|
2017-09-07 23:56:15 +00:00
|
|
|
return fmt.Errorf("job %q in namespace %q at version %d not found", args.JobID, args.RequestNamespace(), args.JobVersion)
|
2017-07-06 19:49:13 +00:00
|
|
|
}
|
|
|
|
|
2017-07-06 22:19:07 +00:00
|
|
|
// Commit this stability request via Raft
|
2017-07-06 19:49:13 +00:00
|
|
|
_, modifyIndex, err := j.srv.raftApply(structs.JobStabilityRequestType, args)
|
|
|
|
if err != nil {
|
2018-09-15 23:23:13 +00:00
|
|
|
j.logger.Error("submitting job stability request failed", "error", err)
|
2017-07-06 19:49:13 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Setup the reply
|
|
|
|
reply.Index = modifyIndex
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2015-08-16 01:11:26 +00:00
|
|
|
// Evaluate is used to force a job for re-evaluation
|
|
|
|
func (j *Job) Evaluate(args *structs.JobEvaluateRequest, reply *structs.JobRegisterResponse) error {
|
|
|
|
if done, err := j.srv.forward("Job.Evaluate", args, args, reply); done {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer metrics.MeasureSince([]string{"nomad", "job", "evaluate"}, time.Now())
|
|
|
|
|
2017-09-26 16:05:17 +00:00
|
|
|
// Check for read-job permissions
|
2017-10-12 22:16:33 +00:00
|
|
|
if aclObj, err := j.srv.ResolveToken(args.AuthToken); err != nil {
|
2017-09-26 16:05:17 +00:00
|
|
|
return err
|
|
|
|
} else if aclObj != nil && !aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilityReadJob) {
|
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
|
|
|
|
2015-08-16 01:11:26 +00:00
|
|
|
// Validate the arguments
|
|
|
|
if args.JobID == "" {
|
|
|
|
return fmt.Errorf("missing job ID for evaluation")
|
|
|
|
}
|
|
|
|
|
|
|
|
// Lookup the job
|
|
|
|
snap, err := j.srv.fsm.State().Snapshot()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2017-02-08 04:31:23 +00:00
|
|
|
ws := memdb.NewWatchSet()
|
2017-09-07 23:56:15 +00:00
|
|
|
job, err := snap.JobByID(ws, args.RequestNamespace(), args.JobID)
|
2015-08-16 01:11:26 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if job == nil {
|
|
|
|
return fmt.Errorf("job not found")
|
|
|
|
}
|
|
|
|
|
2015-12-01 19:40:40 +00:00
|
|
|
if job.IsPeriodic() {
|
|
|
|
return fmt.Errorf("can't evaluate periodic job")
|
2017-01-20 18:33:52 +00:00
|
|
|
} else if job.IsParameterized() {
|
|
|
|
return fmt.Errorf("can't evaluate parameterized job")
|
2015-12-01 19:40:40 +00:00
|
|
|
}
|
|
|
|
|
2018-05-08 22:26:36 +00:00
|
|
|
forceRescheduleAllocs := make(map[string]*structs.DesiredTransition)
|
2018-05-09 01:00:06 +00:00
|
|
|
|
2018-05-08 22:26:36 +00:00
|
|
|
if args.EvalOptions.ForceReschedule {
|
|
|
|
// Find any failed allocs that could be force rescheduled
|
|
|
|
allocs, err := snap.AllocsByJob(ws, args.RequestNamespace(), args.JobID, false)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, alloc := range allocs {
|
|
|
|
taskGroup := job.LookupTaskGroup(alloc.TaskGroup)
|
|
|
|
// Forcing rescheduling is only allowed if task group has rescheduling enabled
|
2018-05-10 19:42:24 +00:00
|
|
|
if taskGroup == nil || !taskGroup.ReschedulePolicy.Enabled() {
|
2018-05-08 22:26:36 +00:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2018-05-10 19:42:24 +00:00
|
|
|
if alloc.NextAllocation == "" && alloc.ClientStatus == structs.AllocClientStatusFailed && !alloc.DesiredTransition.ShouldForceReschedule() {
|
2018-05-08 22:26:36 +00:00
|
|
|
forceRescheduleAllocs[alloc.ID] = allowForceRescheduleTransition
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-08-16 01:11:26 +00:00
|
|
|
// Create a new evaluation
|
2020-07-15 12:49:17 +00:00
|
|
|
now := time.Now().UnixNano()
|
2015-08-16 01:11:26 +00:00
|
|
|
eval := &structs.Evaluation{
|
2017-09-29 16:58:48 +00:00
|
|
|
ID: uuid.Generate(),
|
2017-09-07 23:56:15 +00:00
|
|
|
Namespace: args.RequestNamespace(),
|
2015-08-16 01:11:26 +00:00
|
|
|
Priority: job.Priority,
|
|
|
|
Type: job.Type,
|
|
|
|
TriggeredBy: structs.EvalTriggerJobRegister,
|
|
|
|
JobID: job.ID,
|
|
|
|
JobModifyIndex: job.ModifyIndex,
|
|
|
|
Status: structs.EvalStatusPending,
|
2019-08-07 16:50:35 +00:00
|
|
|
CreateTime: now,
|
|
|
|
ModifyTime: now,
|
2015-08-16 01:11:26 +00:00
|
|
|
}
|
2018-05-08 22:26:36 +00:00
|
|
|
|
|
|
|
// Create a AllocUpdateDesiredTransitionRequest request with the eval and any forced rescheduled allocs
|
|
|
|
updateTransitionReq := &structs.AllocUpdateDesiredTransitionRequest{
|
|
|
|
Allocs: forceRescheduleAllocs,
|
|
|
|
Evals: []*structs.Evaluation{eval},
|
2015-08-16 01:11:26 +00:00
|
|
|
}
|
2018-05-08 22:26:36 +00:00
|
|
|
_, evalIndex, err := j.srv.raftApply(structs.AllocUpdateDesiredTransitionRequestType, updateTransitionReq)
|
2015-08-16 01:11:26 +00:00
|
|
|
|
|
|
|
if err != nil {
|
2018-09-15 23:23:13 +00:00
|
|
|
j.logger.Error("eval create failed", "error", err, "method", "evaluate")
|
2015-08-16 01:11:26 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Setup the reply
|
|
|
|
reply.EvalID = eval.ID
|
|
|
|
reply.EvalCreateIndex = evalIndex
|
|
|
|
reply.JobModifyIndex = job.ModifyIndex
|
|
|
|
reply.Index = evalIndex
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2015-07-23 21:41:18 +00:00
|
|
|
// Deregister is used to remove a job from the cluster.
|
2015-08-06 21:17:18 +00:00
|
|
|
func (j *Job) Deregister(args *structs.JobDeregisterRequest, reply *structs.JobDeregisterResponse) error {
|
2015-07-23 21:41:18 +00:00
|
|
|
if done, err := j.srv.forward("Job.Deregister", args, args, reply); done {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer metrics.MeasureSince([]string{"nomad", "job", "deregister"}, time.Now())
|
|
|
|
|
2017-09-27 19:19:14 +00:00
|
|
|
// Check for submit-job permissions
|
2017-10-12 22:16:33 +00:00
|
|
|
if aclObj, err := j.srv.ResolveToken(args.AuthToken); err != nil {
|
2017-09-27 19:19:14 +00:00
|
|
|
return err
|
|
|
|
} else if aclObj != nil && !aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilitySubmitJob) {
|
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
|
|
|
|
2015-12-01 19:40:40 +00:00
|
|
|
// Validate the arguments
|
|
|
|
if args.JobID == "" {
|
2017-07-06 19:49:13 +00:00
|
|
|
return fmt.Errorf("missing job ID for deregistering")
|
2015-12-01 19:40:40 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Lookup the job
|
|
|
|
snap, err := j.srv.fsm.State().Snapshot()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2017-02-08 04:31:23 +00:00
|
|
|
ws := memdb.NewWatchSet()
|
2017-09-07 23:56:15 +00:00
|
|
|
job, err := snap.JobByID(ws, args.RequestNamespace(), args.JobID)
|
2015-12-01 19:40:40 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2020-07-15 15:10:57 +00:00
|
|
|
var eval *structs.Evaluation
|
|
|
|
|
Atomic eval insertion with job (de-)registration
This fixes a bug where jobs may get "stuck" unprocessed that
disproportionately affects periodic jobs around leadership transitions.
When registering a job, the job registration and the eval to process it
get applied to raft as two separate transactions; if the job
registration succeeds but eval application fails, the job may remain
unprocessed. Operators may detect such failure, when submitting a job
update and get a 500 error code, and they could retry; periodic jobs
failures are more likely to go unnoticed, and no further periodic
invocations will be processed until an operator forces an evaluation.
This fixes the issue by ensuring that the job registration and eval
application get persisted and processed atomically in the same raft log
entry.
Also, applies the same change to ensure atomicity in job deregistration.
Backward Compatibility
We must maintain compatibility in two scenarios: mixed clusters where a
leader can handle atomic updates but followers cannot, and a recent
cluster processes old log entries from legacy or mixed cluster mode.
To handle these constraints: ensure that the leader continues to emit the
Evaluation log entry until all servers have upgraded; also, when
processing raft logs, the servers honor evaluations found in both spots,
the Eval in job (de-)registration and the eval update entries.
When an updated server sees mix-mode behavior where an eval is inserted
into the raft log twice, it ignores the second instance.
I made one compromise in consistency in the mixed-mode scenario: servers
may disagree on the eval.CreateIndex value: the leader and updated
servers will report the job registration index while old servers will
report the index of the eval update log entry. This discrepancy doesn't
seem to be material - it's the eval.JobModifyIndex that matters.
2020-07-10 17:31:55 +00:00
|
|
|
// The job priority / type is strange for this, since it's not a high
|
|
|
|
// priority even if the job was.
|
2020-07-15 12:49:17 +00:00
|
|
|
now := time.Now().UnixNano()
|
2020-07-15 15:10:57 +00:00
|
|
|
|
Atomic eval insertion with job (de-)registration
This fixes a bug where jobs may get "stuck" unprocessed that
dispropotionately affect periodic jobs around leadership transitions.
When registering a job, the job registration and the eval to process it
get applied to raft as two separate transactions; if the job
registration succeeds but eval application fails, the job may remain
unprocessed. Operators may detect such failure, when submitting a job
update and get a 500 error code, and they could retry; periodic jobs
failures are more likely to go unnoticed, and no further periodic
invocations will be processed until an operator force evaluation.
This fixes the issue by ensuring that the job registration and eval
application get persisted and processed atomically in the same raft log
entry.
Also, applies the same change to ensure atomicity in job deregistration.
Backward Compatibility
We must maintain compatibility in two scenarios: mixed clusters where a
leader can handle atomic updates but followers cannot, and a recent
cluster processes old log entries from legacy or mixed cluster mode.
To handle this constraints: ensure that the leader continue to emit the
Evaluation log entry until all servers have upgraded; also, when
processing raft logs, the servers honor evaluations found in both spots,
the Eval in job (de-)registration and the eval update entries.
When an updated server sees mix-mode behavior where an eval is inserted
into the raft log twice, it ignores the second instance.
I made one compromise in consistency in the mixed-mode scenario: servers
may disagree on the eval.CreateIndex value: the leader and updated
servers will report the job registration index while old servers will
report the index of the eval update log entry. This discripency doesn't
seem to be material - it's the eval.JobModifyIndex that matters.
2020-07-10 17:31:55 +00:00
|
|
|
// If the job is periodic or parameterized, we don't create an eval.
|
|
|
|
if job == nil || !(job.IsPeriodic() || job.IsParameterized()) {
|
2021-11-23 08:23:31 +00:00
|
|
|
|
|
|
|
// The evaluation priority is determined by several factors. It
|
|
|
|
// defaults to the job default priority and is overridden by the
|
|
|
|
// priority set on the job specification.
|
|
|
|
//
|
|
|
|
// If the user supplied an eval priority override, we subsequently
|
|
|
|
// use this.
|
|
|
|
priority := structs.JobDefaultPriority
|
|
|
|
if job != nil {
|
|
|
|
priority = job.Priority
|
|
|
|
}
|
|
|
|
if args.EvalPriority > 0 {
|
|
|
|
priority = args.EvalPriority
|
|
|
|
}
|
|
|
|
|
2020-07-15 15:10:57 +00:00
|
|
|
eval = &structs.Evaluation{
|
Atomic eval insertion with job (de-)registration
This fixes a bug where jobs may get "stuck" unprocessed that
dispropotionately affect periodic jobs around leadership transitions.
When registering a job, the job registration and the eval to process it
get applied to raft as two separate transactions; if the job
registration succeeds but eval application fails, the job may remain
unprocessed. Operators may detect such failure, when submitting a job
update and get a 500 error code, and they could retry; periodic jobs
failures are more likely to go unnoticed, and no further periodic
invocations will be processed until an operator force evaluation.
This fixes the issue by ensuring that the job registration and eval
application get persisted and processed atomically in the same raft log
entry.
Also, applies the same change to ensure atomicity in job deregistration.
Backward Compatibility
We must maintain compatibility in two scenarios: mixed clusters where a
leader can handle atomic updates but followers cannot, and a recent
cluster processes old log entries from legacy or mixed cluster mode.
To handle this constraints: ensure that the leader continue to emit the
Evaluation log entry until all servers have upgraded; also, when
processing raft logs, the servers honor evaluations found in both spots,
the Eval in job (de-)registration and the eval update entries.
When an updated server sees mix-mode behavior where an eval is inserted
into the raft log twice, it ignores the second instance.
I made one compromise in consistency in the mixed-mode scenario: servers
may disagree on the eval.CreateIndex value: the leader and updated
servers will report the job registration index while old servers will
report the index of the eval update log entry. This discripency doesn't
seem to be material - it's the eval.JobModifyIndex that matters.
2020-07-10 17:31:55 +00:00
|
|
|
ID: uuid.Generate(),
|
|
|
|
Namespace: args.RequestNamespace(),
|
2021-11-02 08:11:44 +00:00
|
|
|
Priority: priority,
|
Atomic eval insertion with job (de-)registration
This fixes a bug where jobs may get "stuck" unprocessed that
dispropotionately affect periodic jobs around leadership transitions.
When registering a job, the job registration and the eval to process it
get applied to raft as two separate transactions; if the job
registration succeeds but eval application fails, the job may remain
unprocessed. Operators may detect such failure, when submitting a job
update and get a 500 error code, and they could retry; periodic jobs
failures are more likely to go unnoticed, and no further periodic
invocations will be processed until an operator force evaluation.
This fixes the issue by ensuring that the job registration and eval
application get persisted and processed atomically in the same raft log
entry.
Also, applies the same change to ensure atomicity in job deregistration.
Backward Compatibility
We must maintain compatibility in two scenarios: mixed clusters where a
leader can handle atomic updates but followers cannot, and a recent
cluster processes old log entries from legacy or mixed cluster mode.
To handle this constraints: ensure that the leader continue to emit the
Evaluation log entry until all servers have upgraded; also, when
processing raft logs, the servers honor evaluations found in both spots,
the Eval in job (de-)registration and the eval update entries.
When an updated server sees mix-mode behavior where an eval is inserted
into the raft log twice, it ignores the second instance.
I made one compromise in consistency in the mixed-mode scenario: servers
may disagree on the eval.CreateIndex value: the leader and updated
servers will report the job registration index while old servers will
report the index of the eval update log entry. This discripency doesn't
seem to be material - it's the eval.JobModifyIndex that matters.
2020-07-10 17:31:55 +00:00
|
|
|
Type: structs.JobTypeService,
|
|
|
|
TriggeredBy: structs.EvalTriggerJobDeregister,
|
|
|
|
JobID: args.JobID,
|
|
|
|
Status: structs.EvalStatusPending,
|
|
|
|
CreateTime: now,
|
|
|
|
ModifyTime: now,
|
|
|
|
}
|
2020-07-15 15:10:57 +00:00
|
|
|
reply.EvalID = eval.ID
|
|
|
|
}
|
|
|
|
|
|
|
|
// COMPAT(1.1.0): remove conditional and always set args.Eval
|
|
|
|
if ServersMeetMinimumVersion(j.srv.Members(), minJobRegisterAtomicEvalVersion, false) {
|
|
|
|
args.Eval = eval
|
Atomic eval insertion with job (de-)registration
This fixes a bug where jobs may get "stuck" unprocessed that
dispropotionately affect periodic jobs around leadership transitions.
When registering a job, the job registration and the eval to process it
get applied to raft as two separate transactions; if the job
registration succeeds but eval application fails, the job may remain
unprocessed. Operators may detect such failure, when submitting a job
update and get a 500 error code, and they could retry; periodic jobs
failures are more likely to go unnoticed, and no further periodic
invocations will be processed until an operator force evaluation.
This fixes the issue by ensuring that the job registration and eval
application get persisted and processed atomically in the same raft log
entry.
Also, applies the same change to ensure atomicity in job deregistration.
Backward Compatibility
We must maintain compatibility in two scenarios: mixed clusters where a
leader can handle atomic updates but followers cannot, and a recent
cluster processes old log entries from legacy or mixed cluster mode.
To handle this constraints: ensure that the leader continue to emit the
Evaluation log entry until all servers have upgraded; also, when
processing raft logs, the servers honor evaluations found in both spots,
the Eval in job (de-)registration and the eval update entries.
When an updated server sees mix-mode behavior where an eval is inserted
into the raft log twice, it ignores the second instance.
I made one compromise in consistency in the mixed-mode scenario: servers
may disagree on the eval.CreateIndex value: the leader and updated
servers will report the job registration index while old servers will
report the index of the eval update log entry. This discripency doesn't
seem to be material - it's the eval.JobModifyIndex that matters.
2020-07-10 17:31:55 +00:00
|
|
|
}
|
|
|
|
|
2020-05-20 19:22:51 +00:00
|
|
|
// Commit the job update via Raft
|
2015-07-23 21:41:18 +00:00
|
|
|
_, index, err := j.srv.raftApply(structs.JobDeregisterRequestType, args)
|
|
|
|
if err != nil {
|
2018-09-15 23:23:13 +00:00
|
|
|
j.logger.Error("deregister failed", "error", err)
|
2015-07-23 21:41:18 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2015-12-01 19:40:40 +00:00
|
|
|
// Populate the reply with job information
|
|
|
|
reply.JobModifyIndex = index
|
Atomic eval insertion with job (de-)registration
This fixes a bug where jobs may get "stuck" unprocessed that
dispropotionately affect periodic jobs around leadership transitions.
When registering a job, the job registration and the eval to process it
get applied to raft as two separate transactions; if the job
registration succeeds but eval application fails, the job may remain
unprocessed. Operators may detect such failure, when submitting a job
update and get a 500 error code, and they could retry; periodic jobs
failures are more likely to go unnoticed, and no further periodic
invocations will be processed until an operator force evaluation.
This fixes the issue by ensuring that the job registration and eval
application get persisted and processed atomically in the same raft log
entry.
Also, applies the same change to ensure atomicity in job deregistration.
Backward Compatibility
We must maintain compatibility in two scenarios: mixed clusters where a
leader can handle atomic updates but followers cannot, and a recent
cluster processes old log entries from legacy or mixed cluster mode.
To handle this constraints: ensure that the leader continue to emit the
Evaluation log entry until all servers have upgraded; also, when
processing raft logs, the servers honor evaluations found in both spots,
the Eval in job (de-)registration and the eval update entries.
When an updated server sees mix-mode behavior where an eval is inserted
into the raft log twice, it ignores the second instance.
I made one compromise in consistency in the mixed-mode scenario: servers
may disagree on the eval.CreateIndex value: the leader and updated
servers will report the job registration index while old servers will
report the index of the eval update log entry. This discripency doesn't
seem to be material - it's the eval.JobModifyIndex that matters.
2020-07-10 17:31:55 +00:00
|
|
|
reply.EvalCreateIndex = index
|
|
|
|
reply.Index = index
|
2015-12-01 19:40:40 +00:00
|
|
|
|
2020-07-15 15:10:57 +00:00
|
|
|
// COMPAT(1.1.0) - Remove entire conditional block
|
|
|
|
// 0.12.1 introduced atomic job deregistration eval
|
|
|
|
if eval != nil && args.Eval == nil {
|
Atomic eval insertion with job (de-)registration
This fixes a bug where jobs may get "stuck" unprocessed that
dispropotionately affect periodic jobs around leadership transitions.
When registering a job, the job registration and the eval to process it
get applied to raft as two separate transactions; if the job
registration succeeds but eval application fails, the job may remain
unprocessed. Operators may detect such failure, when submitting a job
update and get a 500 error code, and they could retry; periodic jobs
failures are more likely to go unnoticed, and no further periodic
invocations will be processed until an operator force evaluation.
This fixes the issue by ensuring that the job registration and eval
application get persisted and processed atomically in the same raft log
entry.
Also, applies the same change to ensure atomicity in job deregistration.
Backward Compatibility
We must maintain compatibility in two scenarios: mixed clusters where a
leader can handle atomic updates but followers cannot, and a recent
cluster processes old log entries from legacy or mixed cluster mode.
To handle this constraints: ensure that the leader continue to emit the
Evaluation log entry until all servers have upgraded; also, when
processing raft logs, the servers honor evaluations found in both spots,
the Eval in job (de-)registration and the eval update entries.
When an updated server sees mix-mode behavior where an eval is inserted
into the raft log twice, it ignores the second instance.
I made one compromise in consistency in the mixed-mode scenario: servers
may disagree on the eval.CreateIndex value: the leader and updated
servers will report the job registration index while old servers will
report the index of the eval update log entry. This discripency doesn't
seem to be material - it's the eval.JobModifyIndex that matters.
2020-07-10 17:31:55 +00:00
|
|
|
// Create a new evaluation
|
2020-07-15 15:10:57 +00:00
|
|
|
eval.JobModifyIndex = index
|
Atomic eval insertion with job (de-)registration
This fixes a bug where jobs may get "stuck" unprocessed that
dispropotionately affect periodic jobs around leadership transitions.
When registering a job, the job registration and the eval to process it
get applied to raft as two separate transactions; if the job
registration succeeds but eval application fails, the job may remain
unprocessed. Operators may detect such failure, when submitting a job
update and get a 500 error code, and they could retry; periodic jobs
failures are more likely to go unnoticed, and no further periodic
invocations will be processed until an operator force evaluation.
This fixes the issue by ensuring that the job registration and eval
application get persisted and processed atomically in the same raft log
entry.
Also, applies the same change to ensure atomicity in job deregistration.
Backward Compatibility
We must maintain compatibility in two scenarios: mixed clusters where a
leader can handle atomic updates but followers cannot, and a recent
cluster processes old log entries from legacy or mixed cluster mode.
To handle this constraints: ensure that the leader continue to emit the
Evaluation log entry until all servers have upgraded; also, when
processing raft logs, the servers honor evaluations found in both spots,
the Eval in job (de-)registration and the eval update entries.
When an updated server sees mix-mode behavior where an eval is inserted
into the raft log twice, it ignores the second instance.
I made one compromise in consistency in the mixed-mode scenario: servers
may disagree on the eval.CreateIndex value: the leader and updated
servers will report the job registration index while old servers will
report the index of the eval update log entry. This discripency doesn't
seem to be material - it's the eval.JobModifyIndex that matters.
2020-07-10 17:31:55 +00:00
|
|
|
update := &structs.EvalUpdateRequest{
|
2020-07-15 15:10:57 +00:00
|
|
|
Evals: []*structs.Evaluation{eval},
|
Atomic eval insertion with job (de-)registration
This fixes a bug where jobs may get "stuck" unprocessed that
dispropotionately affect periodic jobs around leadership transitions.
When registering a job, the job registration and the eval to process it
get applied to raft as two separate transactions; if the job
registration succeeds but eval application fails, the job may remain
unprocessed. Operators may detect such failure, when submitting a job
update and get a 500 error code, and they could retry; periodic jobs
failures are more likely to go unnoticed, and no further periodic
invocations will be processed until an operator force evaluation.
This fixes the issue by ensuring that the job registration and eval
application get persisted and processed atomically in the same raft log
entry.
Also, applies the same change to ensure atomicity in job deregistration.
Backward Compatibility
We must maintain compatibility in two scenarios: mixed clusters where a
leader can handle atomic updates but followers cannot, and a recent
cluster processes old log entries from legacy or mixed cluster mode.
To handle this constraints: ensure that the leader continue to emit the
Evaluation log entry until all servers have upgraded; also, when
processing raft logs, the servers honor evaluations found in both spots,
the Eval in job (de-)registration and the eval update entries.
When an updated server sees mix-mode behavior where an eval is inserted
into the raft log twice, it ignores the second instance.
I made one compromise in consistency in the mixed-mode scenario: servers
may disagree on the eval.CreateIndex value: the leader and updated
servers will report the job registration index while old servers will
report the index of the eval update log entry. This discripency doesn't
seem to be material - it's the eval.JobModifyIndex that matters.
2020-07-10 17:31:55 +00:00
|
|
|
WriteRequest: structs.WriteRequest{Region: args.Region},
|
|
|
|
}
|
2020-04-05 14:47:40 +00:00
|
|
|
|
Atomic eval insertion with job (de-)registration
This fixes a bug where jobs may get "stuck" unprocessed that
dispropotionately affect periodic jobs around leadership transitions.
When registering a job, the job registration and the eval to process it
get applied to raft as two separate transactions; if the job
registration succeeds but eval application fails, the job may remain
unprocessed. Operators may detect such failure, when submitting a job
update and get a 500 error code, and they could retry; periodic jobs
failures are more likely to go unnoticed, and no further periodic
invocations will be processed until an operator force evaluation.
This fixes the issue by ensuring that the job registration and eval
application get persisted and processed atomically in the same raft log
entry.
Also, applies the same change to ensure atomicity in job deregistration.
Backward Compatibility
We must maintain compatibility in two scenarios: mixed clusters where a
leader can handle atomic updates but followers cannot, and a recent
cluster processes old log entries from legacy or mixed cluster mode.
To handle this constraints: ensure that the leader continue to emit the
Evaluation log entry until all servers have upgraded; also, when
processing raft logs, the servers honor evaluations found in both spots,
the Eval in job (de-)registration and the eval update entries.
When an updated server sees mix-mode behavior where an eval is inserted
into the raft log twice, it ignores the second instance.
I made one compromise in consistency in the mixed-mode scenario: servers
may disagree on the eval.CreateIndex value: the leader and updated
servers will report the job registration index while old servers will
report the index of the eval update log entry. This discripency doesn't
seem to be material - it's the eval.JobModifyIndex that matters.
2020-07-10 17:31:55 +00:00
|
|
|
// Commit this evaluation via Raft
|
2020-07-15 15:10:57 +00:00
|
|
|
_, evalIndex, err := j.srv.raftApply(structs.EvalUpdateRequestType, update)
|
Atomic eval insertion with job (de-)registration
This fixes a bug where jobs may get "stuck" unprocessed that
disproportionately affect periodic jobs around leadership transitions.
When registering a job, the job registration and the eval to process it
get applied to raft as two separate transactions; if the job
registration succeeds but eval application fails, the job may remain
unprocessed. Operators may detect such failure, when submitting a job
update and get a 500 error code, and they could retry; periodic jobs
failures are more likely to go unnoticed, and no further periodic
invocations will be processed until an operator force evaluation.
This fixes the issue by ensuring that the job registration and eval
application get persisted and processed atomically in the same raft log
entry.
Also, applies the same change to ensure atomicity in job deregistration.
Backward Compatibility
We must maintain compatibility in two scenarios: mixed clusters where a
leader can handle atomic updates but followers cannot, and a recent
cluster processes old log entries from legacy or mixed cluster mode.
To handle these constraints: ensure that the leader continues to emit the
Evaluation log entry until all servers have upgraded; also, when
processing raft logs, the servers honor evaluations found in both spots,
the Eval in job (de-)registration and the eval update entries.
When an updated server sees mix-mode behavior where an eval is inserted
into the raft log twice, it ignores the second instance.
I made one compromise in consistency in the mixed-mode scenario: servers
may disagree on the eval.CreateIndex value: the leader and updated
servers will report the job registration index while old servers will
report the index of the eval update log entry. This discrepancy doesn't
seem to be material - it's the eval.JobModifyIndex that matters.
2020-07-10 17:31:55 +00:00
|
|
|
if err != nil {
|
|
|
|
j.logger.Error("eval create failed", "error", err, "method", "deregister")
|
2020-08-06 18:51:46 +00:00
|
|
|
return err
|
Atomic eval insertion with job (de-)registration
This fixes a bug where jobs may get "stuck" unprocessed that
disproportionately affect periodic jobs around leadership transitions.
When registering a job, the job registration and the eval to process it
get applied to raft as two separate transactions; if the job
registration succeeds but eval application fails, the job may remain
unprocessed. Operators may detect such failure, when submitting a job
update and get a 500 error code, and they could retry; periodic jobs
failures are more likely to go unnoticed, and no further periodic
invocations will be processed until an operator force evaluation.
This fixes the issue by ensuring that the job registration and eval
application get persisted and processed atomically in the same raft log
entry.
Also, applies the same change to ensure atomicity in job deregistration.
Backward Compatibility
We must maintain compatibility in two scenarios: mixed clusters where a
leader can handle atomic updates but followers cannot, and a recent
cluster processes old log entries from legacy or mixed cluster mode.
To handle these constraints: ensure that the leader continues to emit the
Evaluation log entry until all servers have upgraded; also, when
processing raft logs, the servers honor evaluations found in both spots,
the Eval in job (de-)registration and the eval update entries.
When an updated server sees mix-mode behavior where an eval is inserted
into the raft log twice, it ignores the second instance.
I made one compromise in consistency in the mixed-mode scenario: servers
may disagree on the eval.CreateIndex value: the leader and updated
servers will report the job registration index while old servers will
report the index of the eval update log entry. This discrepancy doesn't
seem to be material - it's the eval.JobModifyIndex that matters.
2020-07-10 17:31:55 +00:00
|
|
|
}
|
2020-07-15 15:10:57 +00:00
|
|
|
|
|
|
|
reply.EvalCreateIndex = evalIndex
|
|
|
|
reply.Index = evalIndex
|
2015-08-06 21:17:18 +00:00
|
|
|
}
|
|
|
|
|
2020-08-28 18:28:13 +00:00
|
|
|
err = j.multiregionStop(job, args, reply)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2020-08-06 18:51:46 +00:00
|
|
|
return nil
|
2015-07-23 21:41:18 +00:00
|
|
|
}
|
|
|
|
|
2018-03-14 22:32:18 +00:00
|
|
|
// BatchDeregister is used to remove a set of jobs from the cluster.
|
|
|
|
func (j *Job) BatchDeregister(args *structs.JobBatchDeregisterRequest, reply *structs.JobBatchDeregisterResponse) error {
|
|
|
|
if done, err := j.srv.forward("Job.BatchDeregister", args, args, reply); done {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer metrics.MeasureSince([]string{"nomad", "job", "batch_deregister"}, time.Now())
|
|
|
|
|
|
|
|
// Resolve the ACL token
|
|
|
|
aclObj, err := j.srv.ResolveToken(args.AuthToken)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Validate the arguments
|
|
|
|
if len(args.Jobs) == 0 {
|
|
|
|
return fmt.Errorf("given no jobs to deregister")
|
|
|
|
}
|
|
|
|
if len(args.Evals) != 0 {
|
|
|
|
return fmt.Errorf("evaluations should not be populated")
|
|
|
|
}
|
|
|
|
|
|
|
|
// Loop through checking for permissions
|
|
|
|
for jobNS := range args.Jobs {
|
|
|
|
// Check for submit-job permissions
|
|
|
|
if aclObj != nil && !aclObj.AllowNsOp(jobNS.Namespace, acl.NamespaceCapabilitySubmitJob) {
|
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-03-16 17:52:19 +00:00
|
|
|
// Grab a snapshot
|
|
|
|
snap, err := j.srv.fsm.State().Snapshot()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2018-03-14 22:32:18 +00:00
|
|
|
// Loop through to create evals
|
|
|
|
for jobNS, options := range args.Jobs {
|
|
|
|
if options == nil {
|
|
|
|
return fmt.Errorf("no deregister options provided for %v", jobNS)
|
|
|
|
}
|
|
|
|
|
|
|
|
job, err := snap.JobByID(nil, jobNS.Namespace, jobNS.ID)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
// If the job is periodic or parameterized, we don't create an eval.
|
|
|
|
if job != nil && (job.IsPeriodic() || job.IsParameterized()) {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
priority := structs.JobDefaultPriority
|
|
|
|
jtype := structs.JobTypeService
|
|
|
|
if job != nil {
|
|
|
|
priority = job.Priority
|
|
|
|
jtype = job.Type
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create a new evaluation
|
2020-07-15 12:49:17 +00:00
|
|
|
now := time.Now().UnixNano()
|
2018-03-14 22:32:18 +00:00
|
|
|
eval := &structs.Evaluation{
|
|
|
|
ID: uuid.Generate(),
|
|
|
|
Namespace: jobNS.Namespace,
|
|
|
|
Priority: priority,
|
|
|
|
Type: jtype,
|
|
|
|
TriggeredBy: structs.EvalTriggerJobDeregister,
|
|
|
|
JobID: jobNS.ID,
|
|
|
|
Status: structs.EvalStatusPending,
|
2019-08-07 16:50:35 +00:00
|
|
|
CreateTime: now,
|
|
|
|
ModifyTime: now,
|
2018-03-14 22:32:18 +00:00
|
|
|
}
|
|
|
|
args.Evals = append(args.Evals, eval)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Commit this update via Raft
|
|
|
|
_, index, err := j.srv.raftApply(structs.JobBatchDeregisterRequestType, args)
|
|
|
|
if err != nil {
|
2018-09-15 23:23:13 +00:00
|
|
|
j.logger.Error("batch deregister failed", "error", err)
|
2018-03-14 22:32:18 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
reply.Index = index
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2020-03-20 22:00:31 +00:00
|
|
|
// Scale is used to modify one of the scaling targets in the job
|
2020-01-17 16:51:35 +00:00
|
|
|
func (j *Job) Scale(args *structs.JobScaleRequest, reply *structs.JobRegisterResponse) error {
|
|
|
|
if done, err := j.srv.forward("Job.Scale", args, args, reply); done {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer metrics.MeasureSince([]string{"nomad", "job", "scale"}, time.Now())
|
|
|
|
|
2021-01-14 20:40:42 +00:00
|
|
|
namespace := args.RequestNamespace()
|
2020-03-18 14:32:59 +00:00
|
|
|
|
2021-01-14 20:40:42 +00:00
|
|
|
// Authorize request
|
|
|
|
aclObj, err := j.srv.ResolveToken(args.AuthToken)
|
|
|
|
if err != nil {
|
2020-01-17 16:51:35 +00:00
|
|
|
return err
|
2021-01-14 20:40:42 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if aclObj != nil {
|
|
|
|
hasScaleJob := aclObj.AllowNsOp(namespace, acl.NamespaceCapabilityScaleJob)
|
|
|
|
hasSubmitJob := aclObj.AllowNsOp(namespace, acl.NamespaceCapabilitySubmitJob)
|
2020-03-22 21:49:09 +00:00
|
|
|
if !(hasScaleJob || hasSubmitJob) {
|
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
2020-01-17 16:51:35 +00:00
|
|
|
}
|
|
|
|
|
2021-12-06 20:20:34 +00:00
|
|
|
if ok, err := registrationsAreAllowed(aclObj, j.srv.State()); !ok || err != nil {
|
|
|
|
j.logger.Warn("job scaling is currently disabled for non-management ACL")
|
|
|
|
return structs.ErrJobRegistrationDisabled
|
|
|
|
}
|
|
|
|
|
2021-01-14 20:40:42 +00:00
|
|
|
// Validate args
|
|
|
|
err = args.Validate()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Find job
|
2020-01-17 16:51:35 +00:00
|
|
|
snap, err := j.srv.fsm.State().Snapshot()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2021-01-14 20:40:42 +00:00
|
|
|
|
2020-01-17 16:51:35 +00:00
|
|
|
ws := memdb.NewWatchSet()
|
2020-03-18 14:32:59 +00:00
|
|
|
job, err := snap.JobByID(ws, namespace, args.JobID)
|
2020-01-17 16:51:35 +00:00
|
|
|
if err != nil {
|
2020-06-17 17:03:35 +00:00
|
|
|
j.logger.Error("unable to lookup job", "error", err)
|
2020-01-17 16:51:35 +00:00
|
|
|
return err
|
|
|
|
}
|
2021-01-14 20:40:42 +00:00
|
|
|
|
2020-01-27 22:14:28 +00:00
|
|
|
if job == nil {
|
2020-03-18 14:32:59 +00:00
|
|
|
return structs.NewErrRPCCoded(404, fmt.Sprintf("job %q not found", args.JobID))
|
2020-01-27 22:14:28 +00:00
|
|
|
}
|
2020-01-17 16:51:35 +00:00
|
|
|
|
2021-01-14 20:40:42 +00:00
|
|
|
// Find target group in job TaskGroups
|
|
|
|
groupName := args.Target[structs.ScalingTargetGroup]
|
|
|
|
var group *structs.TaskGroup
|
2020-03-23 13:38:18 +00:00
|
|
|
for _, tg := range job.TaskGroups {
|
2021-01-14 20:40:42 +00:00
|
|
|
if tg.Name == groupName {
|
|
|
|
group = tg
|
2020-03-23 13:38:18 +00:00
|
|
|
break
|
2020-03-22 21:49:09 +00:00
|
|
|
}
|
2020-03-23 13:38:18 +00:00
|
|
|
}
|
2021-01-14 20:40:42 +00:00
|
|
|
|
|
|
|
if group == nil {
|
2020-03-23 13:38:18 +00:00
|
|
|
return structs.NewErrRPCCoded(400,
|
|
|
|
fmt.Sprintf("task group %q specified for scaling does not exist in job", groupName))
|
|
|
|
}
|
|
|
|
|
2020-07-15 12:49:17 +00:00
|
|
|
now := time.Now().UnixNano()
|
2021-01-14 20:40:42 +00:00
|
|
|
prevCount := int64(group.Count)
|
2020-03-23 13:38:18 +00:00
|
|
|
|
2021-01-14 20:40:42 +00:00
|
|
|
event := &structs.ScalingEventRequest{
|
|
|
|
Namespace: job.Namespace,
|
|
|
|
JobID: job.ID,
|
|
|
|
TaskGroup: groupName,
|
|
|
|
ScalingEvent: &structs.ScalingEvent{
|
|
|
|
Time: now,
|
|
|
|
PreviousCount: prevCount,
|
|
|
|
Count: args.Count,
|
|
|
|
Message: args.Message,
|
|
|
|
Error: args.Error,
|
|
|
|
Meta: args.Meta,
|
|
|
|
},
|
|
|
|
}
|
2020-06-17 17:03:35 +00:00
|
|
|
|
2021-01-14 20:40:42 +00:00
|
|
|
if args.Count != nil {
|
|
|
|
// Further validation for count-based scaling event
|
|
|
|
if group.Scaling != nil {
|
|
|
|
if *args.Count < group.Scaling.Min {
|
2021-01-08 19:24:36 +00:00
|
|
|
return structs.NewErrRPCCoded(400,
|
2021-01-08 21:13:29 +00:00
|
|
|
fmt.Sprintf("group count was less than scaling policy minimum: %d < %d",
|
2021-01-14 20:40:42 +00:00
|
|
|
*args.Count, group.Scaling.Min))
|
2021-01-08 19:24:36 +00:00
|
|
|
}
|
2021-01-14 20:40:42 +00:00
|
|
|
if group.Scaling.Max < *args.Count {
|
2021-01-08 19:24:36 +00:00
|
|
|
return structs.NewErrRPCCoded(400,
|
2021-01-08 21:13:29 +00:00
|
|
|
fmt.Sprintf("group count was greater than scaling policy maximum: %d > %d",
|
2021-01-14 20:40:42 +00:00
|
|
|
*args.Count, group.Scaling.Max))
|
2021-01-08 19:24:36 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-01-14 20:40:42 +00:00
|
|
|
// Update group count
|
|
|
|
group.Count = int(*args.Count)
|
|
|
|
|
|
|
|
// Block scaling event if there's an active deployment
|
|
|
|
deployment, err := snap.LatestDeploymentByJobID(ws, namespace, args.JobID)
|
2020-06-17 17:03:35 +00:00
|
|
|
if err != nil {
|
|
|
|
j.logger.Error("unable to lookup latest deployment", "error", err)
|
|
|
|
return err
|
|
|
|
}
|
2021-01-14 20:40:42 +00:00
|
|
|
|
|
|
|
if deployment != nil && deployment.Active() && deployment.JobCreateIndex == job.CreateIndex {
|
2021-11-23 15:20:18 +00:00
|
|
|
return structs.NewErrRPCCoded(400, "job scaling blocked due to active deployment")
|
2020-06-17 17:03:35 +00:00
|
|
|
}
|
|
|
|
|
2021-01-14 20:40:42 +00:00
|
|
|
// Commit the job update
|
|
|
|
_, jobModifyIndex, err := j.srv.raftApply(
|
|
|
|
structs.JobRegisterRequestType,
|
|
|
|
structs.JobRegisterRequest{
|
|
|
|
Job: job,
|
|
|
|
EnforceIndex: true,
|
|
|
|
JobModifyIndex: job.ModifyIndex,
|
|
|
|
PolicyOverride: args.PolicyOverride,
|
|
|
|
WriteRequest: args.WriteRequest,
|
|
|
|
},
|
|
|
|
)
|
2020-03-22 21:49:09 +00:00
|
|
|
if err != nil {
|
|
|
|
j.logger.Error("job register for scale failed", "error", err)
|
|
|
|
return err
|
|
|
|
}
|
2020-03-23 13:38:18 +00:00
|
|
|
reply.JobModifyIndex = jobModifyIndex
|
2020-03-18 14:32:59 +00:00
|
|
|
|
2021-01-14 20:40:42 +00:00
|
|
|
// Create an eval for non-dispatch jobs
|
|
|
|
if !(job.IsPeriodic() || job.IsParameterized()) {
|
|
|
|
eval := &structs.Evaluation{
|
|
|
|
ID: uuid.Generate(),
|
|
|
|
Namespace: namespace,
|
2021-11-02 11:57:53 +00:00
|
|
|
Priority: job.Priority, // Safe as nil check performed above.
|
2021-01-14 20:40:42 +00:00
|
|
|
Type: structs.JobTypeService,
|
|
|
|
TriggeredBy: structs.EvalTriggerScaling,
|
|
|
|
JobID: args.JobID,
|
|
|
|
JobModifyIndex: reply.JobModifyIndex,
|
|
|
|
Status: structs.EvalStatusPending,
|
|
|
|
CreateTime: now,
|
|
|
|
ModifyTime: now,
|
|
|
|
}
|
2020-01-17 16:51:35 +00:00
|
|
|
|
2021-01-14 20:40:42 +00:00
|
|
|
_, evalIndex, err := j.srv.raftApply(
|
|
|
|
structs.EvalUpdateRequestType,
|
|
|
|
&structs.EvalUpdateRequest{
|
|
|
|
Evals: []*structs.Evaluation{eval},
|
|
|
|
WriteRequest: structs.WriteRequest{Region: args.Region},
|
|
|
|
},
|
|
|
|
)
|
|
|
|
if err != nil {
|
|
|
|
j.logger.Error("eval create failed", "error", err, "method", "scale")
|
|
|
|
return err
|
|
|
|
}
|
2020-03-23 13:38:18 +00:00
|
|
|
|
2021-01-14 20:40:42 +00:00
|
|
|
reply.EvalID = eval.ID
|
|
|
|
reply.EvalCreateIndex = evalIndex
|
|
|
|
event.ScalingEvent.EvalID = &reply.EvalID
|
|
|
|
}
|
2020-03-23 13:38:18 +00:00
|
|
|
} else {
|
2021-01-14 20:40:42 +00:00
|
|
|
reply.JobModifyIndex = job.ModifyIndex
|
2020-03-23 13:38:18 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
_, eventIndex, err := j.srv.raftApply(structs.ScalingEventRegisterRequestType, event)
|
2020-01-17 16:51:35 +00:00
|
|
|
if err != nil {
|
2020-03-23 13:38:18 +00:00
|
|
|
j.logger.Error("scaling event create failed", "error", err)
|
2020-01-17 16:51:35 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2020-03-23 13:38:18 +00:00
|
|
|
reply.Index = eventIndex
|
2021-01-14 20:40:42 +00:00
|
|
|
|
2020-03-23 13:38:18 +00:00
|
|
|
j.srv.setQueryMeta(&reply.QueryMeta)
|
2021-01-14 20:40:42 +00:00
|
|
|
|
2020-01-17 16:51:35 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2015-07-23 21:41:18 +00:00
|
|
|
// GetJob is used to request information about a specific job
|
|
|
|
func (j *Job) GetJob(args *structs.JobSpecificRequest,
|
|
|
|
reply *structs.SingleJobResponse) error {
|
|
|
|
if done, err := j.srv.forward("Job.GetJob", args, args, reply); done {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer metrics.MeasureSince([]string{"nomad", "job", "get_job"}, time.Now())
|
|
|
|
|
2017-09-26 17:38:03 +00:00
|
|
|
// Check for read-job permissions
|
2017-10-12 22:16:33 +00:00
|
|
|
if aclObj, err := j.srv.ResolveToken(args.AuthToken); err != nil {
|
2017-09-26 17:38:03 +00:00
|
|
|
return err
|
|
|
|
} else if aclObj != nil && !aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilityReadJob) {
|
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
|
|
|
|
2015-10-29 22:01:29 +00:00
|
|
|
// Setup the blocking query
|
|
|
|
opts := blockingOptions{
|
|
|
|
queryOpts: &args.QueryOptions,
|
|
|
|
queryMeta: &reply.QueryMeta,
|
2017-02-08 04:31:23 +00:00
|
|
|
run: func(ws memdb.WatchSet, state *state.StateStore) error {
|
2015-10-29 22:01:29 +00:00
|
|
|
// Look for the job
|
2017-09-07 23:56:15 +00:00
|
|
|
out, err := state.JobByID(ws, args.RequestNamespace(), args.JobID)
|
2015-10-29 22:01:29 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2015-07-23 21:41:18 +00:00
|
|
|
|
2015-10-29 22:01:29 +00:00
|
|
|
// Setup the output
|
2015-10-30 02:00:02 +00:00
|
|
|
reply.Job = out
|
2015-10-29 22:01:29 +00:00
|
|
|
if out != nil {
|
|
|
|
reply.Index = out.ModifyIndex
|
|
|
|
} else {
|
|
|
|
// Use the last index that affected the nodes table
|
2017-02-08 04:31:23 +00:00
|
|
|
index, err := state.Index("jobs")
|
2015-10-29 22:01:29 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
reply.Index = index
|
|
|
|
}
|
|
|
|
|
|
|
|
// Set the query response
|
|
|
|
j.srv.setQueryMeta(&reply.QueryMeta)
|
|
|
|
return nil
|
|
|
|
}}
|
|
|
|
return j.srv.blockingRPC(&opts)
|
2015-07-23 21:41:18 +00:00
|
|
|
}
|
2015-09-06 19:18:45 +00:00
|
|
|
|
2017-04-13 22:47:59 +00:00
|
|
|
// GetJobVersions is used to retrieve all tracked versions of a job.
|
2017-06-30 00:16:20 +00:00
|
|
|
func (j *Job) GetJobVersions(args *structs.JobVersionsRequest,
|
2017-04-13 22:47:59 +00:00
|
|
|
reply *structs.JobVersionsResponse) error {
|
|
|
|
if done, err := j.srv.forward("Job.GetJobVersions", args, args, reply); done {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer metrics.MeasureSince([]string{"nomad", "job", "get_job_versions"}, time.Now())
|
|
|
|
|
2017-09-27 17:24:51 +00:00
|
|
|
// Check for read-job permissions
|
2017-10-12 22:16:33 +00:00
|
|
|
if aclObj, err := j.srv.ResolveToken(args.AuthToken); err != nil {
|
2017-09-27 17:24:51 +00:00
|
|
|
return err
|
|
|
|
} else if aclObj != nil && !aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilityReadJob) {
|
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
|
|
|
|
2017-04-13 22:47:59 +00:00
|
|
|
// Setup the blocking query
|
|
|
|
opts := blockingOptions{
|
|
|
|
queryOpts: &args.QueryOptions,
|
|
|
|
queryMeta: &reply.QueryMeta,
|
|
|
|
run: func(ws memdb.WatchSet, state *state.StateStore) error {
|
|
|
|
// Look for the job
|
2017-09-07 23:56:15 +00:00
|
|
|
out, err := state.JobVersionsByID(ws, args.RequestNamespace(), args.JobID)
|
2017-04-13 22:47:59 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Setup the output
|
|
|
|
reply.Versions = out
|
|
|
|
if len(out) != 0 {
|
|
|
|
reply.Index = out[0].ModifyIndex
|
2017-06-30 00:16:20 +00:00
|
|
|
|
|
|
|
// Compute the diffs
|
2017-06-30 01:42:37 +00:00
|
|
|
if args.Diffs {
|
|
|
|
for i := 0; i < len(out)-1; i++ {
|
|
|
|
old, new := out[i+1], out[i]
|
|
|
|
d, err := old.Diff(new, true)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("failed to create job diff: %v", err)
|
|
|
|
}
|
|
|
|
reply.Diffs = append(reply.Diffs, d)
|
2017-06-30 00:16:20 +00:00
|
|
|
}
|
|
|
|
}
|
2017-04-13 22:47:59 +00:00
|
|
|
} else {
|
|
|
|
// Use the last index that affected the nodes table
|
2017-05-05 20:52:01 +00:00
|
|
|
index, err := state.Index("job_version")
|
2017-04-13 22:47:59 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
reply.Index = index
|
|
|
|
}
|
|
|
|
|
|
|
|
// Set the query response
|
|
|
|
j.srv.setQueryMeta(&reply.QueryMeta)
|
|
|
|
return nil
|
|
|
|
}}
|
|
|
|
return j.srv.blockingRPC(&opts)
|
|
|
|
}
|
|
|
|
|
2020-05-19 13:51:41 +00:00
|
|
|
// allowedNSes returns a set (as map of ns->true) of the namespaces a token has access to.
|
|
|
|
// Returns `nil` set if the token has access to all namespaces
|
|
|
|
// and ErrPermissionDenied if the token has no capabilities on any namespace.
|
2020-09-09 22:30:40 +00:00
|
|
|
func allowedNSes(aclObj *acl.ACL, state *state.StateStore, allow func(ns string) bool) (map[string]bool, error) {
|
2020-05-18 17:47:13 +00:00
|
|
|
if aclObj == nil || aclObj.IsManagement() {
|
2020-05-19 13:51:41 +00:00
|
|
|
return nil, nil
|
2020-05-18 17:47:13 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// namespaces
|
|
|
|
nses, err := state.NamespaceNames()
|
|
|
|
if err != nil {
|
2020-05-19 13:51:41 +00:00
|
|
|
return nil, err
|
2020-05-18 17:47:13 +00:00
|
|
|
}
|
|
|
|
|
2020-05-19 13:51:41 +00:00
|
|
|
r := make(map[string]bool, len(nses))
|
2020-05-18 17:47:13 +00:00
|
|
|
|
2020-05-19 13:51:41 +00:00
|
|
|
for _, ns := range nses {
|
2020-09-09 22:30:40 +00:00
|
|
|
if allow(ns) {
|
2020-05-19 13:51:41 +00:00
|
|
|
r[ns] = true
|
2020-05-18 17:47:13 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-05-19 13:51:41 +00:00
|
|
|
if len(r) == 0 {
|
|
|
|
return nil, structs.ErrPermissionDenied
|
|
|
|
}
|
|
|
|
|
|
|
|
return r, nil
|
2020-05-18 17:47:13 +00:00
|
|
|
}
|
|
|
|
|
2021-12-06 20:20:34 +00:00
|
|
|
// registrationsAreAllowed checks that the scheduler is not in
|
|
|
|
// RejectJobRegistration mode for load-shedding.
|
|
|
|
func registrationsAreAllowed(aclObj *acl.ACL, state *state.StateStore) (bool, error) {
|
|
|
|
_, cfg, err := state.SchedulerConfig()
|
|
|
|
if err != nil {
|
|
|
|
return false, err
|
|
|
|
}
|
|
|
|
if cfg != nil && !cfg.RejectJobRegistration {
|
|
|
|
return true, nil
|
|
|
|
}
|
|
|
|
if aclObj != nil && aclObj.IsManagement() {
|
|
|
|
return true, nil
|
|
|
|
}
|
|
|
|
return false, nil
|
|
|
|
}
|
|
|
|
|
2015-09-06 19:18:45 +00:00
|
|
|
// List is used to list the jobs registered in the system
|
2020-05-18 17:47:13 +00:00
|
|
|
func (j *Job) List(args *structs.JobListRequest, reply *structs.JobListResponse) error {
|
2015-09-06 19:18:45 +00:00
|
|
|
if done, err := j.srv.forward("Job.List", args, args, reply); done {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer metrics.MeasureSince([]string{"nomad", "job", "list"}, time.Now())
|
|
|
|
|
2020-06-17 19:10:44 +00:00
|
|
|
if args.RequestNamespace() == structs.AllNamespacesSentinel {
|
2020-05-18 17:47:13 +00:00
|
|
|
return j.listAllNamespaces(args, reply)
|
|
|
|
}
|
|
|
|
|
2017-09-14 22:46:00 +00:00
|
|
|
// Check for list-job permissions
|
2017-10-12 22:16:33 +00:00
|
|
|
if aclObj, err := j.srv.ResolveToken(args.AuthToken); err != nil {
|
2017-09-14 22:46:00 +00:00
|
|
|
return err
|
|
|
|
} else if aclObj != nil && !aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilityListJobs) {
|
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
|
|
|
|
2015-10-27 21:36:32 +00:00
|
|
|
// Setup the blocking query
|
|
|
|
opts := blockingOptions{
|
2015-10-29 21:47:39 +00:00
|
|
|
queryOpts: &args.QueryOptions,
|
|
|
|
queryMeta: &reply.QueryMeta,
|
2017-02-08 04:31:23 +00:00
|
|
|
run: func(ws memdb.WatchSet, state *state.StateStore) error {
|
2015-10-27 21:36:32 +00:00
|
|
|
// Capture all the jobs
|
2017-02-08 04:31:23 +00:00
|
|
|
var err error
|
2015-12-24 10:46:59 +00:00
|
|
|
var iter memdb.ResultIterator
|
|
|
|
if prefix := args.QueryOptions.Prefix; prefix != "" {
|
2017-09-07 23:56:15 +00:00
|
|
|
iter, err = state.JobsByIDPrefix(ws, args.RequestNamespace(), prefix)
|
2015-12-24 10:46:59 +00:00
|
|
|
} else {
|
2017-09-07 23:56:15 +00:00
|
|
|
iter, err = state.JobsByNamespace(ws, args.RequestNamespace())
|
2015-12-24 10:46:59 +00:00
|
|
|
}
|
2015-10-27 21:36:32 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2015-10-28 19:43:00 +00:00
|
|
|
var jobs []*structs.JobListStub
|
2015-10-27 21:36:32 +00:00
|
|
|
for {
|
|
|
|
raw := iter.Next()
|
|
|
|
if raw == nil {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
job := raw.(*structs.Job)
|
2017-09-07 23:56:15 +00:00
|
|
|
summary, err := state.JobSummaryByID(ws, args.RequestNamespace(), job.ID)
|
2016-07-22 06:13:07 +00:00
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("unable to look up summary for job: %v", job.ID)
|
|
|
|
}
|
2016-07-21 20:21:47 +00:00
|
|
|
jobs = append(jobs, job.Stub(summary))
|
2015-10-27 21:36:32 +00:00
|
|
|
}
|
2015-10-28 19:43:00 +00:00
|
|
|
reply.Jobs = jobs
|
2015-10-27 21:36:32 +00:00
|
|
|
|
2018-03-15 17:22:03 +00:00
|
|
|
// Use the last index that affected the jobs table or summary
|
|
|
|
jindex, err := state.Index("jobs")
|
2015-10-27 21:36:32 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2018-03-15 17:22:03 +00:00
|
|
|
sindex, err := state.Index("job_summary")
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
reply.Index = helper.Uint64Max(jindex, sindex)
|
2015-10-27 21:36:32 +00:00
|
|
|
|
|
|
|
// Set the query response
|
|
|
|
j.srv.setQueryMeta(&reply.QueryMeta)
|
|
|
|
return nil
|
|
|
|
}}
|
|
|
|
return j.srv.blockingRPC(&opts)
|
2015-09-06 19:18:45 +00:00
|
|
|
}
|
|
|
|
|
2020-05-18 17:47:13 +00:00
|
|
|
// listAllNamespaces lists all jobs across all namespaces
|
|
|
|
func (j *Job) listAllNamespaces(args *structs.JobListRequest, reply *structs.JobListResponse) error {
|
|
|
|
// Check for list-job permissions
|
|
|
|
aclObj, err := j.srv.ResolveToken(args.AuthToken)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
prefix := args.QueryOptions.Prefix
|
2020-09-09 22:30:40 +00:00
|
|
|
allow := func(ns string) bool {
|
|
|
|
return aclObj.AllowNsOp(ns, acl.NamespaceCapabilityListJobs)
|
|
|
|
}
|
2020-05-18 17:47:13 +00:00
|
|
|
|
|
|
|
// Setup the blocking query
|
|
|
|
opts := blockingOptions{
|
|
|
|
queryOpts: &args.QueryOptions,
|
|
|
|
queryMeta: &reply.QueryMeta,
|
|
|
|
run: func(ws memdb.WatchSet, state *state.StateStore) error {
|
|
|
|
// check if user has permission to all namespaces
|
2020-09-09 22:30:40 +00:00
|
|
|
allowedNSes, err := allowedNSes(aclObj, state, allow)
|
2020-05-19 13:51:41 +00:00
|
|
|
if err == structs.ErrPermissionDenied {
|
|
|
|
// return empty jobs if token isn't authorized for any
|
|
|
|
// namespace, matching other endpoints
|
|
|
|
reply.Jobs = []*structs.JobListStub{}
|
|
|
|
return nil
|
|
|
|
} else if err != nil {
|
2020-05-18 17:47:13 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Capture all the jobs
|
|
|
|
iter, err := state.Jobs(ws)
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
var jobs []*structs.JobListStub
|
|
|
|
for {
|
|
|
|
raw := iter.Next()
|
|
|
|
if raw == nil {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
job := raw.(*structs.Job)
|
2020-05-19 13:51:41 +00:00
|
|
|
if allowedNSes != nil && !allowedNSes[job.Namespace] {
|
|
|
|
// not permitted to this name namespace
|
|
|
|
continue
|
|
|
|
}
|
2020-05-18 17:47:13 +00:00
|
|
|
if prefix != "" && !strings.HasPrefix(job.ID, prefix) {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
summary, err := state.JobSummaryByID(ws, job.Namespace, job.ID)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("unable to look up summary for job: %v", job.ID)
|
|
|
|
}
|
|
|
|
|
|
|
|
stub := job.Stub(summary)
|
|
|
|
jobs = append(jobs, stub)
|
|
|
|
}
|
|
|
|
reply.Jobs = jobs
|
|
|
|
|
|
|
|
// Use the last index that affected the jobs table or summary
|
|
|
|
jindex, err := state.Index("jobs")
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
sindex, err := state.Index("job_summary")
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
reply.Index = helper.Uint64Max(jindex, sindex)
|
|
|
|
|
|
|
|
// Set the query response
|
|
|
|
j.srv.setQueryMeta(&reply.QueryMeta)
|
|
|
|
return nil
|
|
|
|
}}
|
|
|
|
return j.srv.blockingRPC(&opts)
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2015-09-06 19:18:45 +00:00
|
|
|
// Allocations is used to list the allocations for a job
|
|
|
|
func (j *Job) Allocations(args *structs.JobSpecificRequest,
|
|
|
|
reply *structs.JobAllocationsResponse) error {
|
|
|
|
if done, err := j.srv.forward("Job.Allocations", args, args, reply); done {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer metrics.MeasureSince([]string{"nomad", "job", "allocations"}, time.Now())
|
|
|
|
|
2017-09-26 18:01:23 +00:00
|
|
|
// Check for read-job permissions
|
2017-10-12 22:16:33 +00:00
|
|
|
if aclObj, err := j.srv.ResolveToken(args.AuthToken); err != nil {
|
2017-09-26 18:01:23 +00:00
|
|
|
return err
|
|
|
|
} else if aclObj != nil && !aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilityReadJob) {
|
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
|
|
|
|
2019-11-13 23:36:15 +00:00
|
|
|
// Ensure JobID is set otherwise everything works and never returns
|
|
|
|
// allocations which can hide bugs in request code.
|
|
|
|
if args.JobID == "" {
|
|
|
|
return fmt.Errorf("missing job ID")
|
|
|
|
}
|
|
|
|
|
2015-10-29 22:26:14 +00:00
|
|
|
// Setup the blocking query
|
|
|
|
opts := blockingOptions{
|
|
|
|
queryOpts: &args.QueryOptions,
|
|
|
|
queryMeta: &reply.QueryMeta,
|
2017-02-08 04:31:23 +00:00
|
|
|
run: func(ws memdb.WatchSet, state *state.StateStore) error {
|
2015-10-29 22:26:14 +00:00
|
|
|
// Capture the allocations
|
2019-05-10 22:15:27 +00:00
|
|
|
allocs, err := state.AllocsByJob(ws, args.RequestNamespace(), args.JobID, args.All)
|
2015-10-29 22:26:14 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2015-09-06 19:18:45 +00:00
|
|
|
|
2015-10-29 22:26:14 +00:00
|
|
|
// Convert to stubs
|
|
|
|
if len(allocs) > 0 {
|
|
|
|
reply.Allocations = make([]*structs.AllocListStub, 0, len(allocs))
|
|
|
|
for _, alloc := range allocs {
|
2020-10-09 05:21:41 +00:00
|
|
|
reply.Allocations = append(reply.Allocations, alloc.Stub(nil))
|
2015-10-29 22:26:14 +00:00
|
|
|
}
|
|
|
|
}
|
2015-09-06 23:14:41 +00:00
|
|
|
|
2015-10-29 22:26:14 +00:00
|
|
|
// Use the last index that affected the allocs table
|
2017-02-08 04:31:23 +00:00
|
|
|
index, err := state.Index("allocs")
|
2015-10-29 22:26:14 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
reply.Index = index
|
2015-09-06 19:24:25 +00:00
|
|
|
|
2015-10-29 22:26:14 +00:00
|
|
|
// Set the query response
|
|
|
|
j.srv.setQueryMeta(&reply.QueryMeta)
|
|
|
|
return nil
|
|
|
|
|
|
|
|
}}
|
|
|
|
return j.srv.blockingRPC(&opts)
|
2015-09-06 19:18:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Evaluations is used to list the evaluations for a job
|
|
|
|
func (j *Job) Evaluations(args *structs.JobSpecificRequest,
|
|
|
|
reply *structs.JobEvaluationsResponse) error {
|
|
|
|
if done, err := j.srv.forward("Job.Evaluations", args, args, reply); done {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer metrics.MeasureSince([]string{"nomad", "job", "evaluations"}, time.Now())
|
|
|
|
|
2017-09-26 20:12:37 +00:00
|
|
|
// Check for read-job permissions
|
2017-10-12 22:16:33 +00:00
|
|
|
if aclObj, err := j.srv.ResolveToken(args.AuthToken); err != nil {
|
2017-09-26 20:12:37 +00:00
|
|
|
return err
|
|
|
|
} else if aclObj != nil && !aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilityReadJob) {
|
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
|
|
|
|
2016-10-30 00:30:34 +00:00
|
|
|
// Setup the blocking query
|
|
|
|
opts := blockingOptions{
|
|
|
|
queryOpts: &args.QueryOptions,
|
|
|
|
queryMeta: &reply.QueryMeta,
|
2017-02-08 04:31:23 +00:00
|
|
|
run: func(ws memdb.WatchSet, state *state.StateStore) error {
|
2016-10-30 00:30:34 +00:00
|
|
|
// Capture the evals
|
2017-02-08 04:31:23 +00:00
|
|
|
var err error
|
2017-09-07 23:56:15 +00:00
|
|
|
reply.Evaluations, err = state.EvalsByJob(ws, args.RequestNamespace(), args.JobID)
|
2016-10-30 00:30:34 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2015-09-06 19:24:25 +00:00
|
|
|
|
2016-10-30 00:30:34 +00:00
|
|
|
// Use the last index that affected the evals table
|
2017-02-08 04:31:23 +00:00
|
|
|
index, err := state.Index("evals")
|
2016-10-30 00:30:34 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
reply.Index = index
|
|
|
|
|
|
|
|
// Set the query response
|
|
|
|
j.srv.setQueryMeta(&reply.QueryMeta)
|
|
|
|
return nil
|
|
|
|
}}
|
|
|
|
|
|
|
|
return j.srv.blockingRPC(&opts)
|
2015-09-06 19:18:45 +00:00
|
|
|
}
|
2016-05-05 18:21:58 +00:00
|
|
|
|
2017-07-01 00:23:34 +00:00
|
|
|
// Deployments is used to list the deployments for a job
|
|
|
|
func (j *Job) Deployments(args *structs.JobSpecificRequest,
|
|
|
|
reply *structs.DeploymentListResponse) error {
|
|
|
|
if done, err := j.srv.forward("Job.Deployments", args, args, reply); done {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer metrics.MeasureSince([]string{"nomad", "job", "deployments"}, time.Now())
|
|
|
|
|
2017-09-26 20:33:03 +00:00
|
|
|
// Check for read-job permissions
|
2017-10-12 22:16:33 +00:00
|
|
|
if aclObj, err := j.srv.ResolveToken(args.AuthToken); err != nil {
|
2017-09-26 20:33:03 +00:00
|
|
|
return err
|
|
|
|
} else if aclObj != nil && !aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilityReadJob) {
|
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
|
|
|
|
2017-07-01 00:23:34 +00:00
|
|
|
// Setup the blocking query
|
|
|
|
opts := blockingOptions{
|
|
|
|
queryOpts: &args.QueryOptions,
|
|
|
|
queryMeta: &reply.QueryMeta,
|
|
|
|
run: func(ws memdb.WatchSet, state *state.StateStore) error {
|
|
|
|
// Capture the deployments
|
2019-05-10 22:15:27 +00:00
|
|
|
deploys, err := state.DeploymentsByJobID(ws, args.RequestNamespace(), args.JobID, args.All)
|
2017-07-01 00:23:34 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Use the last index that affected the deployment table
|
|
|
|
index, err := state.Index("deployment")
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
reply.Index = index
|
|
|
|
reply.Deployments = deploys
|
|
|
|
|
|
|
|
// Set the query response
|
|
|
|
j.srv.setQueryMeta(&reply.QueryMeta)
|
|
|
|
return nil
|
|
|
|
|
|
|
|
}}
|
|
|
|
return j.srv.blockingRPC(&opts)
|
|
|
|
}
|
|
|
|
|
|
|
|
// LatestDeployment is used to retrieve the latest deployment for a job
|
|
|
|
func (j *Job) LatestDeployment(args *structs.JobSpecificRequest,
|
|
|
|
reply *structs.SingleDeploymentResponse) error {
|
|
|
|
if done, err := j.srv.forward("Job.LatestDeployment", args, args, reply); done {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer metrics.MeasureSince([]string{"nomad", "job", "latest_deployment"}, time.Now())
|
|
|
|
|
2017-09-26 20:53:43 +00:00
|
|
|
// Check for read-job permissions
|
2017-10-12 22:16:33 +00:00
|
|
|
if aclObj, err := j.srv.ResolveToken(args.AuthToken); err != nil {
|
2017-09-26 20:53:43 +00:00
|
|
|
return err
|
|
|
|
} else if aclObj != nil && !aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilityReadJob) {
|
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
|
|
|
|
2017-07-01 00:23:34 +00:00
|
|
|
// Setup the blocking query
|
|
|
|
opts := blockingOptions{
|
|
|
|
queryOpts: &args.QueryOptions,
|
|
|
|
queryMeta: &reply.QueryMeta,
|
|
|
|
run: func(ws memdb.WatchSet, state *state.StateStore) error {
|
|
|
|
// Capture the deployments
|
2019-05-10 22:15:27 +00:00
|
|
|
deploys, err := state.DeploymentsByJobID(ws, args.RequestNamespace(), args.JobID, args.All)
|
2017-07-01 00:23:34 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Use the last index that affected the deployment table
|
|
|
|
index, err := state.Index("deployment")
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
reply.Index = index
|
|
|
|
if len(deploys) > 0 {
|
|
|
|
sort.Slice(deploys, func(i, j int) bool {
|
|
|
|
return deploys[i].CreateIndex > deploys[j].CreateIndex
|
|
|
|
})
|
|
|
|
reply.Deployment = deploys[0]
|
|
|
|
}
|
|
|
|
|
|
|
|
// Set the query response
|
|
|
|
j.srv.setQueryMeta(&reply.QueryMeta)
|
|
|
|
return nil
|
|
|
|
|
|
|
|
}}
|
|
|
|
return j.srv.blockingRPC(&opts)
|
|
|
|
}
|
|
|
|
|
2016-05-05 18:21:58 +00:00
|
|
|
// Plan is used to cause a dry-run evaluation of the Job and return the results
|
|
|
|
// with a potential diff containing annotations.
|
|
|
|
func (j *Job) Plan(args *structs.JobPlanRequest, reply *structs.JobPlanResponse) error {
|
|
|
|
if done, err := j.srv.forward("Job.Plan", args, args, reply); done {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer metrics.MeasureSince([]string{"nomad", "job", "plan"}, time.Now())
|
|
|
|
|
|
|
|
// Validate the arguments
|
|
|
|
if args.Job == nil {
|
|
|
|
return fmt.Errorf("Job required for plan")
|
|
|
|
}
|
|
|
|
|
2019-08-15 15:22:37 +00:00
|
|
|
// Run admission controllers
|
|
|
|
job, warnings, err := j.admissionControllers(args.Job)
|
2017-05-10 03:52:47 +00:00
|
|
|
if err != nil {
|
2016-05-05 18:21:58 +00:00
|
|
|
return err
|
|
|
|
}
|
2019-08-15 15:22:37 +00:00
|
|
|
args.Job = job
|
2016-05-05 18:21:58 +00:00
|
|
|
|
2017-07-07 02:08:51 +00:00
|
|
|
// Set the warning message
|
2019-08-15 15:22:37 +00:00
|
|
|
reply.Warnings = structs.MergeMultierrorWarnings(warnings...)
|
2017-07-07 02:08:51 +00:00
|
|
|
|
2017-09-19 14:47:10 +00:00
|
|
|
// Check job submission permissions, which we assume is the same for plan
|
2017-10-12 22:16:33 +00:00
|
|
|
if aclObj, err := j.srv.ResolveToken(args.AuthToken); err != nil {
|
2017-09-19 14:47:10 +00:00
|
|
|
return err
|
|
|
|
} else if aclObj != nil {
|
2017-11-20 20:00:24 +00:00
|
|
|
if !aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilitySubmitJob) {
|
2017-09-19 14:47:10 +00:00
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
|
|
|
// Check if override is set and we do not have permissions
|
|
|
|
if args.PolicyOverride {
|
2017-11-20 20:00:24 +00:00
|
|
|
if !aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilitySentinelOverride) {
|
2017-09-19 14:47:10 +00:00
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Enforce Sentinel policies
|
|
|
|
policyWarnings, err := j.enforceSubmitJob(args.PolicyOverride, args.Job)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if policyWarnings != nil {
|
2019-08-15 15:22:37 +00:00
|
|
|
warnings = append(warnings, policyWarnings)
|
|
|
|
reply.Warnings = structs.MergeMultierrorWarnings(warnings...)
|
2017-09-19 14:47:10 +00:00
|
|
|
}
|
|
|
|
|
2016-05-05 18:21:58 +00:00
|
|
|
// Acquire a snapshot of the state
|
|
|
|
snap, err := j.srv.fsm.State().Snapshot()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2020-06-24 17:24:55 +00:00
|
|
|
// Interpolate the job for this region
|
|
|
|
err = j.interpolateMultiregionFields(args)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2016-05-05 18:21:58 +00:00
|
|
|
// Get the original job
|
2017-02-08 04:31:23 +00:00
|
|
|
ws := memdb.NewWatchSet()
|
2017-09-07 23:56:15 +00:00
|
|
|
oldJob, err := snap.JobByID(ws, args.RequestNamespace(), args.Job.ID)
|
2016-05-05 18:21:58 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2020-07-30 16:27:36 +00:00
|
|
|
// Ensure that all scaling policies have an appropriate ID
|
|
|
|
if err := propagateScalingPolicyIDs(oldJob, args.Job); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2016-05-05 18:21:58 +00:00
|
|
|
var index uint64
|
2016-06-08 23:48:02 +00:00
|
|
|
var updatedIndex uint64
|
2017-05-23 23:33:55 +00:00
|
|
|
|
2017-06-29 18:01:41 +00:00
|
|
|
if oldJob != nil {
|
2016-06-08 23:48:02 +00:00
|
|
|
index = oldJob.JobModifyIndex
|
2016-05-05 18:21:58 +00:00
|
|
|
|
2017-06-29 18:01:41 +00:00
|
|
|
// We want to reuse deployments where possible, so only insert the job if
|
|
|
|
// it has changed or the job didn't exist
|
|
|
|
if oldJob.SpecChanged(args.Job) {
|
|
|
|
// Insert the updated Job into the snapshot
|
|
|
|
updatedIndex = oldJob.JobModifyIndex + 1
|
2020-10-19 13:30:15 +00:00
|
|
|
if err := snap.UpsertJob(structs.IgnoreUnknownTypeFlag, updatedIndex, args.Job); err != nil {
|
2020-07-30 16:27:36 +00:00
|
|
|
return err
|
|
|
|
}
|
2017-06-29 18:01:41 +00:00
|
|
|
}
|
2017-05-23 23:33:55 +00:00
|
|
|
} else if oldJob == nil {
|
|
|
|
// Insert the updated Job into the snapshot
|
2020-10-19 13:30:15 +00:00
|
|
|
err := snap.UpsertJob(structs.IgnoreUnknownTypeFlag, 100, args.Job)
|
2020-07-30 16:27:36 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2017-05-23 23:33:55 +00:00
|
|
|
}
|
2016-05-05 18:21:58 +00:00
|
|
|
|
|
|
|
// Create an eval and mark it as requiring annotations and insert that as well
|
2020-07-15 12:49:17 +00:00
|
|
|
now := time.Now().UnixNano()
|
2016-05-05 18:21:58 +00:00
|
|
|
eval := &structs.Evaluation{
|
2017-09-29 16:58:48 +00:00
|
|
|
ID: uuid.Generate(),
|
2017-09-07 23:56:15 +00:00
|
|
|
Namespace: args.RequestNamespace(),
|
2016-05-05 18:21:58 +00:00
|
|
|
Priority: args.Job.Priority,
|
|
|
|
Type: args.Job.Type,
|
|
|
|
TriggeredBy: structs.EvalTriggerJobRegister,
|
|
|
|
JobID: args.Job.ID,
|
2016-06-08 23:48:02 +00:00
|
|
|
JobModifyIndex: updatedIndex,
|
2016-05-05 18:21:58 +00:00
|
|
|
Status: structs.EvalStatusPending,
|
|
|
|
AnnotatePlan: true,
|
2019-08-07 16:50:35 +00:00
|
|
|
// Timestamps are added for consistency but this eval is never persisted
|
|
|
|
CreateTime: now,
|
|
|
|
ModifyTime: now,
|
2016-05-05 18:21:58 +00:00
|
|
|
}
|
|
|
|
|
2020-10-19 13:30:15 +00:00
|
|
|
// Ignore eval event creation during snapshot eval creation
|
|
|
|
snap.UpsertEvals(structs.IgnoreUnknownTypeFlag, 100, []*structs.Evaluation{eval})
|
2017-12-18 16:03:55 +00:00
|
|
|
|
2016-05-05 18:21:58 +00:00
|
|
|
// Create an in-memory Planner that returns no errors and stores the
|
|
|
|
// submitted plan and created evals.
|
2016-05-16 19:49:18 +00:00
|
|
|
planner := &scheduler.Harness{
|
|
|
|
State: &snap.StateStore,
|
|
|
|
}
|
2016-05-05 18:21:58 +00:00
|
|
|
|
|
|
|
// Create the scheduler and run it
|
2022-01-15 01:09:14 +00:00
|
|
|
sched, err := scheduler.NewScheduler(eval.Type, j.logger, j.srv.workersEventCh, snap, planner)
|
2016-05-05 18:21:58 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
if err := sched.Process(eval); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Annotate and store the diff
|
2016-05-16 19:49:18 +00:00
|
|
|
if plans := len(planner.Plans); plans != 1 {
|
2016-05-31 21:51:23 +00:00
|
|
|
return fmt.Errorf("scheduler resulted in an unexpected number of plans: %v", plans)
|
2016-05-16 19:49:18 +00:00
|
|
|
}
|
|
|
|
annotations := planner.Plans[0].Annotations
|
2016-05-05 18:21:58 +00:00
|
|
|
if args.Diff {
|
2016-05-11 22:36:28 +00:00
|
|
|
jobDiff, err := oldJob.Diff(args.Job, true)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("failed to create job diff: %v", err)
|
|
|
|
}
|
|
|
|
|
2016-05-12 18:29:38 +00:00
|
|
|
if err := scheduler.Annotate(jobDiff, annotations); err != nil {
|
2016-05-11 22:36:28 +00:00
|
|
|
return fmt.Errorf("failed to annotate job diff: %v", err)
|
|
|
|
}
|
2016-05-05 18:21:58 +00:00
|
|
|
reply.Diff = jobDiff
|
|
|
|
}
|
|
|
|
|
2016-05-31 21:51:23 +00:00
|
|
|
// Grab the failures
|
|
|
|
if len(planner.Evals) != 1 {
|
|
|
|
return fmt.Errorf("scheduler resulted in an unexpected number of eval updates: %v", planner.Evals)
|
|
|
|
}
|
|
|
|
updatedEval := planner.Evals[0]
|
|
|
|
|
2016-06-15 20:34:45 +00:00
|
|
|
// If it is a periodic job calculate the next launch
|
|
|
|
if args.Job.IsPeriodic() && args.Job.Periodic.Enabled {
|
2018-04-26 20:57:45 +00:00
|
|
|
reply.NextPeriodicLaunch, err = args.Job.Periodic.Next(time.Now().In(args.Job.Periodic.GetLocation()))
|
|
|
|
if err != nil {
|
2018-04-26 22:15:43 +00:00
|
|
|
return fmt.Errorf("Failed to parse cron expression: %v", err)
|
2018-04-26 20:57:45 +00:00
|
|
|
}
|
2016-06-15 20:34:45 +00:00
|
|
|
}
|
|
|
|
|
2016-05-31 21:51:23 +00:00
|
|
|
reply.FailedTGAllocs = updatedEval.FailedTGAllocs
|
2016-05-16 18:48:44 +00:00
|
|
|
reply.JobModifyIndex = index
|
2016-05-12 18:29:38 +00:00
|
|
|
reply.Annotations = annotations
|
2016-05-16 19:49:18 +00:00
|
|
|
reply.CreatedEvals = planner.CreateEvals
|
2016-05-12 01:51:48 +00:00
|
|
|
reply.Index = index
|
2016-05-05 18:21:58 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2017-06-27 23:08:18 +00:00
|
|
|
// validateJobUpdate ensures updates to a job are valid.
|
|
|
|
func validateJobUpdate(old, new *structs.Job) error {
|
2018-06-11 17:27:48 +00:00
|
|
|
// Validate Dispatch not set on new Jobs
|
2018-06-11 15:59:03 +00:00
|
|
|
if old == nil {
|
|
|
|
if new.Dispatched {
|
|
|
|
return fmt.Errorf("job can't be submitted with 'Dispatched' set")
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2017-06-27 23:08:18 +00:00
|
|
|
// Type transitions are disallowed
|
|
|
|
if old.Type != new.Type {
|
|
|
|
return fmt.Errorf("cannot update job from type %q to %q", old.Type, new.Type)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Transitioning to/from periodic is disallowed
|
|
|
|
if old.IsPeriodic() && !new.IsPeriodic() {
|
2017-11-18 10:50:52 +00:00
|
|
|
return fmt.Errorf("cannot update periodic job to being non-periodic")
|
2017-06-27 23:08:18 +00:00
|
|
|
}
|
|
|
|
if new.IsPeriodic() && !old.IsPeriodic() {
|
2017-11-18 10:50:52 +00:00
|
|
|
return fmt.Errorf("cannot update non-periodic job to being periodic")
|
2017-06-27 23:08:18 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Transitioning to/from parameterized is disallowed
|
|
|
|
if old.IsParameterized() && !new.IsParameterized() {
|
2021-04-12 13:27:04 +00:00
|
|
|
return fmt.Errorf("cannot update parameterized job to being non-parameterized")
|
2017-06-27 23:08:18 +00:00
|
|
|
}
|
|
|
|
if new.IsParameterized() && !old.IsParameterized() {
|
2021-04-12 13:27:04 +00:00
|
|
|
return fmt.Errorf("cannot update non-parameterized job to being parameterized")
|
2017-06-27 23:08:18 +00:00
|
|
|
}
|
|
|
|
|
2018-06-11 15:59:03 +00:00
|
|
|
if old.Dispatched != new.Dispatched {
|
|
|
|
return fmt.Errorf("field 'Dispatched' is read-only")
|
|
|
|
}
|
|
|
|
|
2017-06-27 23:08:18 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2017-01-20 18:33:52 +00:00
|
|
|
// Dispatch a parameterized job.
|
2016-11-26 02:04:55 +00:00
|
|
|
func (j *Job) Dispatch(args *structs.JobDispatchRequest, reply *structs.JobDispatchResponse) error {
|
|
|
|
if done, err := j.srv.forward("Job.Dispatch", args, args, reply); done {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer metrics.MeasureSince([]string{"nomad", "job", "dispatch"}, time.Now())
|
|
|
|
|
2017-09-27 16:30:13 +00:00
|
|
|
// Check for submit-job permissions
|
2022-02-02 20:03:18 +00:00
|
|
|
aclObj, err := j.srv.ResolveToken(args.AuthToken)
|
|
|
|
if err != nil {
|
2017-09-27 16:30:13 +00:00
|
|
|
return err
|
2017-09-28 14:27:51 +00:00
|
|
|
} else if aclObj != nil && !aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilityDispatchJob) {
|
2017-09-27 16:30:13 +00:00
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
|
|
|
|
2021-12-06 20:20:34 +00:00
|
|
|
if ok, err := registrationsAreAllowed(aclObj, j.srv.State()); !ok || err != nil {
|
|
|
|
j.logger.Warn("job dispatch is currently disabled for non-management ACL")
|
|
|
|
return structs.ErrJobRegistrationDisabled
|
|
|
|
}
|
|
|
|
|
2017-01-20 18:33:52 +00:00
|
|
|
// Lookup the parameterized job
|
2016-11-26 02:04:55 +00:00
|
|
|
if args.JobID == "" {
|
2017-01-20 18:33:52 +00:00
|
|
|
return fmt.Errorf("missing parameterized job ID")
|
2016-11-26 02:04:55 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
snap, err := j.srv.fsm.State().Snapshot()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2017-02-08 04:31:23 +00:00
|
|
|
ws := memdb.NewWatchSet()
|
2017-09-07 23:56:15 +00:00
|
|
|
parameterizedJob, err := snap.JobByID(ws, args.RequestNamespace(), args.JobID)
|
2016-11-26 02:04:55 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2017-01-20 18:33:52 +00:00
|
|
|
if parameterizedJob == nil {
|
|
|
|
return fmt.Errorf("parameterized job not found")
|
2016-11-26 02:04:55 +00:00
|
|
|
}
|
|
|
|
|
2017-01-20 18:33:52 +00:00
|
|
|
if !parameterizedJob.IsParameterized() {
|
|
|
|
return fmt.Errorf("Specified job %q is not a parameterized job", args.JobID)
|
2016-11-26 02:04:55 +00:00
|
|
|
}
|
|
|
|
|
2017-04-15 23:47:19 +00:00
|
|
|
if parameterizedJob.Stop {
|
|
|
|
return fmt.Errorf("Specified job %q is stopped", args.JobID)
|
|
|
|
}
|
|
|
|
|
2016-11-26 02:04:55 +00:00
|
|
|
// Validate the arguments
|
2017-01-20 18:33:52 +00:00
|
|
|
if err := validateDispatchRequest(args, parameterizedJob); err != nil {
|
2016-11-26 02:04:55 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2021-07-07 20:54:56 +00:00
|
|
|
// Avoid creating new dispatched jobs for retry requests, by using the idempotency token
|
2021-06-29 20:52:12 +00:00
|
|
|
if args.IdempotencyToken != "" {
|
2021-06-23 21:51:59 +00:00
|
|
|
// Fetch all jobs that match the parameterized job ID prefix
|
|
|
|
iter, err := snap.JobsByIDPrefix(ws, parameterizedJob.Namespace, parameterizedJob.ID)
|
|
|
|
if err != nil {
|
2021-07-02 15:58:42 +00:00
|
|
|
errMsg := "failed to retrieve jobs for idempotency check"
|
|
|
|
j.logger.Error(errMsg, "error", err)
|
|
|
|
return fmt.Errorf(errMsg)
|
2021-06-23 21:51:59 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Iterate
|
|
|
|
for {
|
|
|
|
raw := iter.Next()
|
|
|
|
if raw == nil {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
|
|
|
|
// Ensure the parent ID is an exact match
|
|
|
|
existingJob := raw.(*structs.Job)
|
2021-07-02 15:58:42 +00:00
|
|
|
if existingJob.ParentID != parameterizedJob.ID {
|
2021-06-23 21:51:59 +00:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2021-07-02 19:08:46 +00:00
|
|
|
// Idempotency tokens match
|
2021-06-29 20:52:12 +00:00
|
|
|
if existingJob.DispatchIdempotencyToken == args.IdempotencyToken {
|
2021-07-02 19:08:46 +00:00
|
|
|
// The existing job has not yet been garbage collected.
|
2021-06-29 20:52:12 +00:00
|
|
|
// Registering a new job would violate the idempotency token.
|
2021-07-02 19:08:46 +00:00
|
|
|
// Return the existing job.
|
|
|
|
reply.JobCreateIndex = existingJob.CreateIndex
|
|
|
|
reply.DispatchedJobID = existingJob.ID
|
|
|
|
reply.Index = existingJob.ModifyIndex
|
|
|
|
|
|
|
|
return nil
|
2021-06-23 21:51:59 +00:00
|
|
|
}
|
|
|
|
}
|
2021-07-02 15:58:42 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Derive the child job and commit it via Raft - with initial status
|
|
|
|
dispatchJob := parameterizedJob.Copy()
|
|
|
|
dispatchJob.ID = structs.DispatchedID(parameterizedJob.ID, time.Now())
|
|
|
|
dispatchJob.ParentID = parameterizedJob.ID
|
|
|
|
dispatchJob.Name = dispatchJob.ID
|
|
|
|
dispatchJob.SetSubmitTime()
|
|
|
|
dispatchJob.Dispatched = true
|
|
|
|
dispatchJob.Status = ""
|
|
|
|
dispatchJob.StatusDescription = ""
|
|
|
|
dispatchJob.DispatchIdempotencyToken = args.IdempotencyToken
|
2021-06-23 21:51:59 +00:00
|
|
|
|
2021-07-02 15:58:42 +00:00
|
|
|
// Merge in the meta data
|
|
|
|
for k, v := range args.Meta {
|
|
|
|
if dispatchJob.Meta == nil {
|
|
|
|
dispatchJob.Meta = make(map[string]string, len(args.Meta))
|
|
|
|
}
|
|
|
|
dispatchJob.Meta[k] = v
|
2021-06-23 21:51:59 +00:00
|
|
|
}
|
|
|
|
|
2016-12-14 20:50:08 +00:00
|
|
|
// Compress the payload
|
|
|
|
dispatchJob.Payload = snappy.Encode(nil, args.Payload)
|
2016-11-26 04:02:18 +00:00
|
|
|
|
2016-11-26 02:04:55 +00:00
|
|
|
regReq := &structs.JobRegisterRequest{
|
|
|
|
Job: dispatchJob,
|
|
|
|
WriteRequest: args.WriteRequest,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Commit this update via Raft
|
2017-09-19 14:47:10 +00:00
|
|
|
fsmErr, jobCreateIndex, err := j.srv.raftApply(structs.JobRegisterRequestType, regReq)
|
|
|
|
if err, ok := fsmErr.(error); ok && err != nil {
|
2018-09-15 23:23:13 +00:00
|
|
|
j.logger.Error("dispatched job register failed", "error", err, "fsm", true)
|
2017-09-19 14:47:10 +00:00
|
|
|
return err
|
|
|
|
}
|
2016-11-26 02:04:55 +00:00
|
|
|
if err != nil {
|
2018-09-15 23:23:13 +00:00
|
|
|
j.logger.Error("dispatched job register failed", "error", err, "raft", true)
|
2016-11-26 02:04:55 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2017-03-27 23:55:17 +00:00
|
|
|
reply.JobCreateIndex = jobCreateIndex
|
|
|
|
reply.DispatchedJobID = dispatchJob.ID
|
|
|
|
reply.Index = jobCreateIndex
|
|
|
|
|
|
|
|
// If the job is periodic, we don't create an eval.
|
|
|
|
if !dispatchJob.IsPeriodic() {
|
|
|
|
// Create a new evaluation
|
2020-07-15 12:49:17 +00:00
|
|
|
now := time.Now().UnixNano()
|
2017-03-27 23:55:17 +00:00
|
|
|
eval := &structs.Evaluation{
|
2017-09-29 16:58:48 +00:00
|
|
|
ID: uuid.Generate(),
|
2017-09-07 23:56:15 +00:00
|
|
|
Namespace: args.RequestNamespace(),
|
2017-03-27 23:55:17 +00:00
|
|
|
Priority: dispatchJob.Priority,
|
|
|
|
Type: dispatchJob.Type,
|
|
|
|
TriggeredBy: structs.EvalTriggerJobRegister,
|
|
|
|
JobID: dispatchJob.ID,
|
|
|
|
JobModifyIndex: jobCreateIndex,
|
|
|
|
Status: structs.EvalStatusPending,
|
2019-08-07 16:50:35 +00:00
|
|
|
CreateTime: now,
|
|
|
|
ModifyTime: now,
|
2017-03-27 23:55:17 +00:00
|
|
|
}
|
|
|
|
update := &structs.EvalUpdateRequest{
|
|
|
|
Evals: []*structs.Evaluation{eval},
|
|
|
|
WriteRequest: structs.WriteRequest{Region: args.Region},
|
|
|
|
}
|
2016-11-26 02:04:55 +00:00
|
|
|
|
2017-03-27 23:55:17 +00:00
|
|
|
// Commit this evaluation via Raft
|
|
|
|
_, evalIndex, err := j.srv.raftApply(structs.EvalUpdateRequestType, update)
|
|
|
|
if err != nil {
|
2018-09-15 23:23:13 +00:00
|
|
|
j.logger.Error("eval create failed", "error", err, "method", "dispatch")
|
2017-03-27 23:55:17 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Setup the reply
|
|
|
|
reply.EvalID = eval.ID
|
|
|
|
reply.EvalCreateIndex = evalIndex
|
|
|
|
reply.Index = evalIndex
|
2016-11-26 02:04:55 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// validateDispatchRequest returns whether the request is valid given the
|
2017-01-20 18:33:52 +00:00
|
|
|
// parameterized job.
|
2016-12-14 20:50:08 +00:00
|
|
|
func validateDispatchRequest(req *structs.JobDispatchRequest, job *structs.Job) error {
|
|
|
|
// Check the payload constraint is met
|
|
|
|
hasInputData := len(req.Payload) != 0
|
2017-01-20 18:33:52 +00:00
|
|
|
if job.ParameterizedJob.Payload == structs.DispatchPayloadRequired && !hasInputData {
|
|
|
|
return fmt.Errorf("Payload is not provided but required by parameterized job")
|
|
|
|
} else if job.ParameterizedJob.Payload == structs.DispatchPayloadForbidden && hasInputData {
|
|
|
|
return fmt.Errorf("Payload provided but forbidden by parameterized job")
|
2016-11-26 02:04:55 +00:00
|
|
|
}
|
|
|
|
|
2016-12-14 20:50:08 +00:00
|
|
|
// Check the payload doesn't exceed the size limit
|
|
|
|
if l := len(req.Payload); l > DispatchPayloadSizeLimit {
|
|
|
|
return fmt.Errorf("Payload exceeds maximum size; %d > %d", l, DispatchPayloadSizeLimit)
|
2016-11-26 02:04:55 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Check if the metadata is a set
|
|
|
|
keys := make(map[string]struct{}, len(req.Meta))
|
2020-08-25 00:39:01 +00:00
|
|
|
for k := range req.Meta {
|
2016-11-26 02:04:55 +00:00
|
|
|
if _, ok := keys[k]; ok {
|
|
|
|
return fmt.Errorf("Duplicate key %q in passed metadata", k)
|
|
|
|
}
|
|
|
|
keys[k] = struct{}{}
|
|
|
|
}
|
|
|
|
|
2017-01-20 18:33:52 +00:00
|
|
|
required := helper.SliceStringToSet(job.ParameterizedJob.MetaRequired)
|
|
|
|
optional := helper.SliceStringToSet(job.ParameterizedJob.MetaOptional)
|
2016-11-26 02:04:55 +00:00
|
|
|
|
|
|
|
// Check the metadata key constraints are met
|
|
|
|
unpermitted := make(map[string]struct{})
|
|
|
|
for k := range req.Meta {
|
|
|
|
_, req := required[k]
|
|
|
|
_, opt := optional[k]
|
2021-06-29 20:52:12 +00:00
|
|
|
if !req && !opt {
|
2016-11-26 02:04:55 +00:00
|
|
|
unpermitted[k] = struct{}{}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if len(unpermitted) != 0 {
|
|
|
|
flat := make([]string, 0, len(unpermitted))
|
|
|
|
for k := range unpermitted {
|
|
|
|
flat = append(flat, k)
|
|
|
|
}
|
|
|
|
|
|
|
|
return fmt.Errorf("Dispatch request included unpermitted metadata keys: %v", flat)
|
|
|
|
}
|
|
|
|
|
|
|
|
missing := make(map[string]struct{})
|
2017-01-20 18:33:52 +00:00
|
|
|
for _, k := range job.ParameterizedJob.MetaRequired {
|
2016-11-26 02:04:55 +00:00
|
|
|
if _, ok := req.Meta[k]; !ok {
|
|
|
|
missing[k] = struct{}{}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if len(missing) != 0 {
|
|
|
|
flat := make([]string, 0, len(missing))
|
|
|
|
for k := range missing {
|
|
|
|
flat = append(flat, k)
|
|
|
|
}
|
|
|
|
|
2016-12-02 23:37:26 +00:00
|
|
|
return fmt.Errorf("Dispatch did not provide required meta keys: %v", flat)
|
2016-11-26 02:04:55 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
2020-03-20 22:00:31 +00:00
|
|
|
|
|
|
|
// ScaleStatus retrieves the scaling status for a job
|
|
|
|
func (j *Job) ScaleStatus(args *structs.JobScaleStatusRequest,
|
|
|
|
reply *structs.JobScaleStatusResponse) error {
|
|
|
|
|
|
|
|
if done, err := j.srv.forward("Job.ScaleStatus", args, args, reply); done {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer metrics.MeasureSince([]string{"nomad", "job", "scale_status"}, time.Now())
|
|
|
|
|
2020-03-22 15:40:39 +00:00
|
|
|
// Check for autoscaler permissions
|
|
|
|
if aclObj, err := j.srv.ResolveToken(args.AuthToken); err != nil {
|
|
|
|
return err
|
|
|
|
} else if aclObj != nil {
|
|
|
|
hasReadJob := aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilityReadJob)
|
|
|
|
hasReadJobScaling := aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilityReadJobScaling)
|
|
|
|
if !(hasReadJob || hasReadJobScaling) {
|
|
|
|
return structs.ErrPermissionDenied
|
|
|
|
}
|
|
|
|
}
|
2020-03-20 22:00:31 +00:00
|
|
|
|
|
|
|
// Setup the blocking query
|
|
|
|
opts := blockingOptions{
|
|
|
|
queryOpts: &args.QueryOptions,
|
|
|
|
queryMeta: &reply.QueryMeta,
|
|
|
|
run: func(ws memdb.WatchSet, state *state.StateStore) error {
|
|
|
|
|
|
|
|
// We need the job and the job summary
|
|
|
|
job, err := state.JobByID(ws, args.RequestNamespace(), args.JobID)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if job == nil {
|
2020-03-21 14:18:43 +00:00
|
|
|
reply.JobScaleStatus = nil
|
|
|
|
return nil
|
2020-03-20 22:00:31 +00:00
|
|
|
}
|
|
|
|
|
2020-04-01 17:28:19 +00:00
|
|
|
events, eventsIndex, err := state.ScalingEventsByJob(ws, args.RequestNamespace(), args.JobID)
|
2020-03-23 13:38:18 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if events == nil {
|
|
|
|
events = make(map[string][]*structs.ScalingEvent)
|
|
|
|
}
|
|
|
|
|
2020-04-27 20:10:09 +00:00
|
|
|
var allocs []*structs.Allocation
|
|
|
|
var allocsIndex uint64
|
|
|
|
allocs, err = state.AllocsByJob(ws, job.Namespace, job.ID, false)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2020-03-20 22:00:31 +00:00
|
|
|
// Setup the output
|
2020-03-21 14:18:43 +00:00
|
|
|
reply.JobScaleStatus = &structs.JobScaleStatus{
|
|
|
|
JobID: job.ID,
|
2020-07-24 09:19:25 +00:00
|
|
|
Namespace: job.Namespace,
|
2020-03-21 14:18:43 +00:00
|
|
|
JobCreateIndex: job.CreateIndex,
|
|
|
|
JobModifyIndex: job.ModifyIndex,
|
|
|
|
JobStopped: job.Stop,
|
|
|
|
TaskGroups: make(map[string]*structs.TaskGroupScaleStatus),
|
|
|
|
}
|
2020-03-20 22:00:31 +00:00
|
|
|
|
|
|
|
for _, tg := range job.TaskGroups {
|
|
|
|
tgScale := &structs.TaskGroupScaleStatus{
|
|
|
|
Desired: tg.Count,
|
|
|
|
}
|
2020-03-23 13:38:18 +00:00
|
|
|
tgScale.Events = events[tg.Name]
|
2020-03-21 14:18:43 +00:00
|
|
|
reply.JobScaleStatus.TaskGroups[tg.Name] = tgScale
|
2020-03-20 22:00:31 +00:00
|
|
|
}
|
|
|
|
|
2020-04-27 20:10:09 +00:00
|
|
|
for _, alloc := range allocs {
|
|
|
|
// TODO: ignore canaries until we figure out what we should do with canaries
|
|
|
|
if alloc.DeploymentStatus != nil && alloc.DeploymentStatus.Canary {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if alloc.TerminalStatus() {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
tgScale, ok := reply.JobScaleStatus.TaskGroups[alloc.TaskGroup]
|
|
|
|
if !ok || tgScale == nil {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
tgScale.Placed++
|
|
|
|
if alloc.ClientStatus == structs.AllocClientStatusRunning {
|
|
|
|
tgScale.Running++
|
|
|
|
}
|
|
|
|
if alloc.DeploymentStatus != nil && alloc.DeploymentStatus.HasHealth() {
|
|
|
|
if alloc.DeploymentStatus.IsHealthy() {
|
|
|
|
tgScale.Healthy++
|
|
|
|
} else if alloc.DeploymentStatus.IsUnhealthy() {
|
|
|
|
tgScale.Unhealthy++
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if alloc.ModifyIndex > allocsIndex {
|
|
|
|
allocsIndex = alloc.ModifyIndex
|
|
|
|
}
|
2020-04-01 17:28:19 +00:00
|
|
|
}
|
2020-04-27 20:10:09 +00:00
|
|
|
|
|
|
|
maxIndex := job.ModifyIndex
|
2020-04-01 17:28:19 +00:00
|
|
|
if eventsIndex > maxIndex {
|
|
|
|
maxIndex = eventsIndex
|
2020-03-20 22:00:31 +00:00
|
|
|
}
|
2020-04-27 20:10:09 +00:00
|
|
|
if allocsIndex > maxIndex {
|
|
|
|
maxIndex = allocsIndex
|
|
|
|
}
|
2020-04-01 17:28:19 +00:00
|
|
|
reply.Index = maxIndex
|
2020-03-20 22:00:31 +00:00
|
|
|
|
|
|
|
// Set the query response
|
|
|
|
j.srv.setQueryMeta(&reply.QueryMeta)
|
|
|
|
return nil
|
|
|
|
}}
|
|
|
|
return j.srv.blockingRPC(&opts)
|
|
|
|
}
|