open-nomad/nomad/deploymentwatcher/deployments_watcher.go

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

458 lines
13 KiB
Go
Raw Normal View History

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
2017-06-26 21:23:52 +00:00
package deploymentwatcher
import (
2017-06-28 04:36:16 +00:00
"context"
2017-06-26 21:23:52 +00:00
"fmt"
"sync"
2017-06-28 21:25:20 +00:00
"time"
2017-06-26 21:23:52 +00:00
2017-06-28 04:36:16 +00:00
"golang.org/x/time/rate"
2018-09-15 23:23:13 +00:00
log "github.com/hashicorp/go-hclog"
2017-08-31 00:45:32 +00:00
memdb "github.com/hashicorp/go-memdb"
2018-09-15 23:23:13 +00:00
2017-08-31 00:45:32 +00:00
"github.com/hashicorp/nomad/nomad/state"
2017-06-26 21:23:52 +00:00
"github.com/hashicorp/nomad/nomad/structs"
)
2017-07-03 19:05:01 +00:00
const (
	// LimitStateQueriesPerSecond is the number of state queries allowed per
	// second.
	LimitStateQueriesPerSecond = 100.0

	// CrossDeploymentUpdateBatchDuration is the duration in which allocation
	// desired transition and evaluation creation updates are batched across
	// all deployment watchers before committing to Raft.
	CrossDeploymentUpdateBatchDuration = 250 * time.Millisecond
)
// notEnabled is the sentinel error handed back to callers that try to use the
// deployment watcher while it is not enabled.
var notEnabled = fmt.Errorf("deployment watcher not enabled")
2017-06-26 21:23:52 +00:00
// DeploymentRaftEndpoints exposes the deployment watcher to a set of functions
// to apply data transforms via Raft.
type DeploymentRaftEndpoints interface {
// UpsertJob is used to upsert a job
UpsertJob(job *structs.Job) (uint64, error)
2017-07-03 19:05:01 +00:00
// UpdateDeploymentStatus is used to make a deployment status update
2017-06-26 21:23:52 +00:00
// and potentially create an evaluation.
2017-07-03 19:05:01 +00:00
UpdateDeploymentStatus(u *structs.DeploymentStatusUpdateRequest) (uint64, error)
2017-06-26 21:23:52 +00:00
2017-07-03 19:05:01 +00:00
// UpdateDeploymentPromotion is used to promote canaries in a deployment
UpdateDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error)
2017-06-26 21:23:52 +00:00
2017-07-03 19:05:01 +00:00
// UpdateDeploymentAllocHealth is used to set the health of allocations in a
2017-06-26 21:23:52 +00:00
// deployment
2017-07-03 19:05:01 +00:00
UpdateDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error)
2018-04-10 22:02:52 +00:00
// UpdateAllocDesiredTransition is used to update the desired transition
// for allocations.
2018-04-10 22:02:52 +00:00
UpdateAllocDesiredTransition(req *structs.AllocUpdateDesiredTransitionRequest) (uint64, error)
2017-06-26 21:23:52 +00:00
}
// Watcher is used to watch deployments and their allocations created
// by the scheduler and trigger the scheduler when allocation health
2018-03-11 19:06:05 +00:00
// transitions.
2017-06-26 21:23:52 +00:00
type Watcher struct {
enabled bool
2018-09-15 23:23:13 +00:00
logger log.Logger
2017-06-26 21:23:52 +00:00
2017-06-28 04:36:16 +00:00
// queryLimiter is used to limit the rate of blocking queries
queryLimiter *rate.Limiter
// updateBatchDuration is the duration to batch allocation desired
2018-04-10 22:02:52 +00:00
// transition and eval creation across all deployment watchers
updateBatchDuration time.Duration
2017-06-28 21:25:20 +00:00
2017-06-26 21:23:52 +00:00
// raft contains the set of Raft endpoints that can be used by the
// deployments watcher
raft DeploymentRaftEndpoints
2017-08-31 00:45:32 +00:00
// state is the state that is watched for state changes.
state *state.StateStore
2017-06-26 21:23:52 +00:00
// server interface for Deployment RPCs
deploymentRPC DeploymentRPC
// server interface for Job RPCs
jobRPC JobRPC
2017-06-26 21:23:52 +00:00
// watchers is the set of active watchers, one per deployment
watchers map[string]*deploymentWatcher
// allocUpdateBatcher is used to batch the creation of evaluations and
2018-04-10 22:02:52 +00:00
// allocation desired transition updates
allocUpdateBatcher *AllocUpdateBatcher
2017-06-26 21:23:52 +00:00
2017-06-28 04:36:16 +00:00
// ctx and exitFn are used to cancel the watcher
ctx context.Context
exitFn context.CancelFunc
2017-06-26 21:23:52 +00:00
l sync.RWMutex
}
// NewDeploymentsWatcher returns a deployments watcher that is used to watch
// deployments and trigger the scheduler as needed.
2018-09-15 23:23:13 +00:00
func NewDeploymentsWatcher(logger log.Logger,
raft DeploymentRaftEndpoints,
deploymentRPC DeploymentRPC, jobRPC JobRPC,
stateQueriesPerSecond float64,
updateBatchDuration time.Duration,
) *Watcher {
2017-06-28 22:35:52 +00:00
2017-06-26 21:23:52 +00:00
return &Watcher{
raft: raft,
deploymentRPC: deploymentRPC,
jobRPC: jobRPC,
queryLimiter: rate.NewLimiter(rate.Limit(stateQueriesPerSecond), 100),
updateBatchDuration: updateBatchDuration,
2018-09-15 23:23:13 +00:00
logger: logger.Named("deployments_watcher"),
2017-06-26 21:23:52 +00:00
}
}
// SetEnabled is used to control if the watcher is enabled. The watcher
2017-08-31 00:45:32 +00:00
// should only be enabled on the active leader. When being enabled the state is
2018-03-11 18:36:49 +00:00
// passed in as it is no longer valid once a leader election has taken place.
2018-02-27 00:28:10 +00:00
func (w *Watcher) SetEnabled(enabled bool, state *state.StateStore) {
2017-06-26 21:23:52 +00:00
w.l.Lock()
2017-07-21 22:44:51 +00:00
defer w.l.Unlock()
2017-06-27 18:52:14 +00:00
wasEnabled := w.enabled
2017-06-26 21:23:52 +00:00
w.enabled = enabled
2017-06-28 22:35:52 +00:00
2017-08-31 00:45:32 +00:00
if state != nil {
w.state = state
}
2017-06-28 22:35:52 +00:00
// Flush the state to create the necessary objects
w.flush(enabled)
2017-06-28 22:35:52 +00:00
// If we are starting now, launch the watch daemon
if enabled && !wasEnabled {
2017-07-21 22:44:51 +00:00
go w.watchDeployments(w.ctx)
2017-06-26 21:23:52 +00:00
}
}
2017-07-03 19:05:01 +00:00
// flush is used to clear the state of the watcher
func (w *Watcher) flush(enabled bool) {
2017-06-26 21:23:52 +00:00
// Stop all the watchers and clear it
for _, watcher := range w.watchers {
watcher.StopWatch()
}
2017-06-28 04:36:16 +00:00
// Kill everything associated with the watcher
2017-06-28 22:35:52 +00:00
if w.exitFn != nil {
w.exitFn()
}
2017-06-26 21:23:52 +00:00
w.watchers = make(map[string]*deploymentWatcher, 32)
2017-06-28 04:36:16 +00:00
w.ctx, w.exitFn = context.WithCancel(context.Background())
if enabled {
w.allocUpdateBatcher = NewAllocUpdateBatcher(w.ctx, w.updateBatchDuration, w.raft)
} else {
w.allocUpdateBatcher = nil
}
2017-06-26 21:23:52 +00:00
}
2017-06-27 18:52:14 +00:00
// watchDeployments is the long lived go-routine that watches for deployments to
// add and remove watchers on.
2017-07-21 22:44:51 +00:00
func (w *Watcher) watchDeployments(ctx context.Context) {
dindex := uint64(1)
2017-06-27 18:52:14 +00:00
for {
// Block getting all deployments using the last deployment index.
2017-08-31 00:45:32 +00:00
deployments, idx, err := w.getDeploys(ctx, dindex)
2017-06-28 04:36:16 +00:00
if err != nil {
2017-08-31 20:03:35 +00:00
if err == context.Canceled {
2017-06-28 04:36:16 +00:00
return
}
2018-09-15 23:23:13 +00:00
w.logger.Error("failed to retrieve deployments", "error", err)
2017-06-27 18:52:14 +00:00
}
2017-08-31 00:45:32 +00:00
// Update the latest index
dindex = idx
2017-06-27 18:52:14 +00:00
// Ensure we are tracking the things we should and not tracking what we
// shouldn't be
2017-08-31 00:45:32 +00:00
for _, d := range deployments {
2017-06-27 18:52:14 +00:00
if d.Active() {
if err := w.add(d); err != nil {
2018-09-15 23:23:13 +00:00
w.logger.Error("failed to track deployment", "deployment_id", d.ID, "error", err)
2017-06-27 18:52:14 +00:00
}
} else {
w.remove(d)
}
}
}
}
// getDeploys retrieves all deployments blocking at the given index.
2017-08-31 00:45:32 +00:00
func (w *Watcher) getDeploys(ctx context.Context, minIndex uint64) ([]*structs.Deployment, uint64, error) {
// state can be updated concurrently
w.l.Lock()
stateStore := w.state
w.l.Unlock()
resp, index, err := stateStore.BlockingQuery(w.getDeploysImpl, minIndex, ctx)
2017-08-31 00:45:32 +00:00
if err != nil {
return nil, 0, err
}
2017-06-28 04:36:16 +00:00
2017-08-31 00:45:32 +00:00
return resp.([]*structs.Deployment), index, nil
}
// getDeploysImpl retrieves all deployments from the passed state store.
func (w *Watcher) getDeploysImpl(ws memdb.WatchSet, store *state.StateStore) (interface{}, uint64, error) {
2017-06-27 18:52:14 +00:00
iter, err := store.Deployments(ws, state.SortDefault)
2017-08-31 00:45:32 +00:00
if err != nil {
return nil, 0, err
}
var deploys []*structs.Deployment
for {
raw := iter.Next()
if raw == nil {
break
2017-06-27 18:52:14 +00:00
}
2017-08-31 00:45:32 +00:00
deploy := raw.(*structs.Deployment)
deploys = append(deploys, deploy)
2017-06-28 04:36:16 +00:00
}
2017-06-27 18:52:14 +00:00
2017-08-31 00:45:32 +00:00
// Use the last index that affected the deployment table
index, err := store.Index("deployment")
2017-08-31 00:45:32 +00:00
if err != nil {
return nil, 0, err
}
return deploys, index, nil
2017-06-27 18:52:14 +00:00
}
// add adds a deployment to the watch list
func (w *Watcher) add(d *structs.Deployment) error {
2017-06-26 21:23:52 +00:00
w.l.Lock()
defer w.l.Unlock()
2017-06-29 05:00:18 +00:00
_, err := w.addLocked(d)
return err
}
2017-06-26 21:23:52 +00:00
2017-06-29 05:00:18 +00:00
// addLocked adds a deployment to the watch list and should only be called when
2019-05-11 20:47:34 +00:00
// locked. Creating the deploymentWatcher starts a go routine to .watch() it
2017-06-29 05:00:18 +00:00
func (w *Watcher) addLocked(d *structs.Deployment) (*deploymentWatcher, error) {
2017-06-26 21:23:52 +00:00
// Not enabled so no-op
if !w.enabled {
2017-06-29 05:00:18 +00:00
return nil, nil
}
if !d.Active() {
return nil, fmt.Errorf("deployment %q is terminal", d.ID)
2017-06-26 21:23:52 +00:00
}
// Already watched so just update the deployment
if w, ok := w.watchers[d.ID]; ok {
w.updateDeployment(d)
2017-06-29 05:00:18 +00:00
return nil, nil
2017-06-27 18:52:14 +00:00
}
// Get the job the deployment is referencing
2017-08-31 00:45:32 +00:00
snap, err := w.state.Snapshot()
if err != nil {
return nil, err
2017-06-27 18:52:14 +00:00
}
2017-08-31 00:45:32 +00:00
2017-09-07 23:56:15 +00:00
job, err := snap.JobByID(nil, d.Namespace, d.JobID)
2017-08-31 00:45:32 +00:00
if err != nil {
2017-06-29 05:00:18 +00:00
return nil, err
2017-06-27 18:52:14 +00:00
}
2017-08-31 00:45:32 +00:00
if job == nil {
2017-06-29 05:00:18 +00:00
return nil, fmt.Errorf("deployment %q references unknown job %q", d.ID, d.JobID)
2017-06-26 21:23:52 +00:00
}
watcher := newDeploymentWatcher(w.ctx, w.queryLimiter, w.logger, w.state, d, job,
w, w.deploymentRPC, w.jobRPC)
2017-06-29 05:00:18 +00:00
w.watchers[d.ID] = watcher
return watcher, nil
2017-06-26 21:23:52 +00:00
}
2017-06-27 18:52:14 +00:00
// remove stops watching a deployment. This can be because the deployment is
2017-06-26 21:23:52 +00:00
// complete or being deleted.
2017-06-27 18:52:14 +00:00
func (w *Watcher) remove(d *structs.Deployment) {
2017-06-26 21:23:52 +00:00
w.l.Lock()
defer w.l.Unlock()
// Not enabled so no-op
if !w.enabled {
return
}
if watcher, ok := w.watchers[d.ID]; ok {
watcher.StopWatch()
delete(w.watchers, d.ID)
}
}
2017-06-29 05:00:18 +00:00
// forceAdd is used to force a lookup of the given deployment object and create
// a watcher. If the deployment does not exist or is terminal an error is
// returned.
func (w *Watcher) forceAdd(dID string) (*deploymentWatcher, error) {
2017-08-31 00:45:32 +00:00
snap, err := w.state.Snapshot()
if err != nil {
return nil, err
}
deployment, err := snap.DeploymentByID(nil, dID)
if err != nil {
2017-06-29 05:00:18 +00:00
return nil, err
}
2017-08-31 00:45:32 +00:00
if deployment == nil {
2017-06-29 05:00:18 +00:00
return nil, fmt.Errorf("unknown deployment %q", dID)
}
2017-08-31 00:45:32 +00:00
return w.addLocked(deployment)
2017-06-29 05:00:18 +00:00
}
2017-07-03 19:05:01 +00:00
// getOrCreateWatcher returns the deployment watcher for the given deployment ID.
func (w *Watcher) getOrCreateWatcher(dID string) (*deploymentWatcher, error) {
2017-06-26 21:23:52 +00:00
w.l.Lock()
defer w.l.Unlock()
// Not enabled so no-op
if !w.enabled {
2017-06-29 05:00:18 +00:00
return nil, notEnabled
2017-06-26 21:23:52 +00:00
}
2017-06-29 05:00:18 +00:00
watcher, ok := w.watchers[dID]
if ok {
return watcher, nil
}
return w.forceAdd(dID)
}
// SetAllocHealth is used to set the health of allocations for a deployment. If
// there are any unhealthy allocations, the deployment is updated to be failed.
// Otherwise the allocations are updated and an evaluation is created.
func (w *Watcher) SetAllocHealth(req *structs.DeploymentAllocHealthRequest, resp *structs.DeploymentUpdateResponse) error {
2017-07-03 19:05:01 +00:00
watcher, err := w.getOrCreateWatcher(req.DeploymentID)
2017-06-29 05:00:18 +00:00
if err != nil {
return err
2017-06-26 21:23:52 +00:00
}
2017-06-28 04:36:16 +00:00
return watcher.SetAllocHealth(req, resp)
2017-06-26 21:23:52 +00:00
}
// PromoteDeployment is used to promote a deployment. If promote is false,
// deployment is marked as failed. Otherwise the deployment is updated and an
// evaluation is created.
2017-06-28 04:36:16 +00:00
func (w *Watcher) PromoteDeployment(req *structs.DeploymentPromoteRequest, resp *structs.DeploymentUpdateResponse) error {
2017-07-03 19:05:01 +00:00
watcher, err := w.getOrCreateWatcher(req.DeploymentID)
2017-06-29 05:00:18 +00:00
if err != nil {
return err
2017-06-26 21:23:52 +00:00
}
2017-06-28 04:36:16 +00:00
return watcher.PromoteDeployment(req, resp)
2017-06-26 21:23:52 +00:00
}
// PauseDeployment is used to toggle the pause state on a deployment. If the
// deployment is being unpaused, an evaluation is created.
2017-06-28 04:36:16 +00:00
func (w *Watcher) PauseDeployment(req *structs.DeploymentPauseRequest, resp *structs.DeploymentUpdateResponse) error {
2017-07-03 19:05:01 +00:00
watcher, err := w.getOrCreateWatcher(req.DeploymentID)
2017-06-29 05:00:18 +00:00
if err != nil {
return err
2017-06-26 21:23:52 +00:00
}
2017-06-28 04:36:16 +00:00
return watcher.PauseDeployment(req, resp)
2017-06-26 21:23:52 +00:00
}
2017-06-28 23:29:48 +00:00
// FailDeployment is used to fail the deployment.
2017-06-29 05:00:18 +00:00
func (w *Watcher) FailDeployment(req *structs.DeploymentFailRequest, resp *structs.DeploymentUpdateResponse) error {
2017-07-03 19:05:01 +00:00
watcher, err := w.getOrCreateWatcher(req.DeploymentID)
2017-06-29 05:00:18 +00:00
if err != nil {
return err
2017-06-28 23:29:48 +00:00
}
return watcher.FailDeployment(req, resp)
}
// RunDeployment starts a pending multiregion deployment. Single-region
// deployments never use the pending state.
//
// The watcher for req.DeploymentID is looked up (and created when missing)
// and the request is forwarded to it.
func (w *Watcher) RunDeployment(req *structs.DeploymentRunRequest, resp *structs.DeploymentUpdateResponse) error {
	dw, lookupErr := w.getOrCreateWatcher(req.DeploymentID)
	if lookupErr != nil {
		return lookupErr
	}

	return dw.RunDeployment(req, resp)
}
2020-06-17 15:02:26 +00:00
// UnblockDeployment releases a blocked multiregion deployment. Single-region
// deployments never use the blocked state.
//
// The watcher for req.DeploymentID is looked up (and created when missing)
// and the request is forwarded to it.
func (w *Watcher) UnblockDeployment(req *structs.DeploymentUnblockRequest, resp *structs.DeploymentUpdateResponse) error {
	dw, lookupErr := w.getOrCreateWatcher(req.DeploymentID)
	if lookupErr != nil {
		return lookupErr
	}

	return dw.UnblockDeployment(req, resp)
}
// CancelDeployment cancels a multiregion deployment. For single-region
// deployments the deploymentwatcher alone is responsible for cancelling, so
// this RPC is never used there.
//
// The watcher for req.DeploymentID is looked up (and created when missing)
// and the request is forwarded to it.
func (w *Watcher) CancelDeployment(req *structs.DeploymentCancelRequest, resp *structs.DeploymentUpdateResponse) error {
	dw, lookupErr := w.getOrCreateWatcher(req.DeploymentID)
	if lookupErr != nil {
		return lookupErr
	}

	return dw.CancelDeployment(req, resp)
}
2018-04-10 22:02:52 +00:00
// createUpdate commits the given allocation desired transition and evaluation
// to Raft but batches the commit with other calls.
func (w *Watcher) createUpdate(allocs map[string]*structs.DesiredTransition, eval *structs.Evaluation) (uint64, error) {
b := w.allocUpdateBatcher
if b == nil {
return 0, notEnabled
}
return b.CreateUpdate(allocs, eval).Results()
2017-06-26 21:23:52 +00:00
}
// upsertJob forwards the job to the Raft endpoints' UpsertJob and returns its
// result unchanged.
func (w *Watcher) upsertJob(job *structs.Job) (uint64, error) {
	index, err := w.raft.UpsertJob(job)
	return index, err
}
// upsertDeploymentStatusUpdate commits the given deployment update and optional
// evaluation to Raft
func (w *Watcher) upsertDeploymentStatusUpdate(
u *structs.DeploymentStatusUpdate,
e *structs.Evaluation,
j *structs.Job) (uint64, error) {
2017-07-03 19:05:01 +00:00
return w.raft.UpdateDeploymentStatus(&structs.DeploymentStatusUpdateRequest{
2017-06-26 21:23:52 +00:00
DeploymentUpdate: u,
Eval: e,
Job: j,
})
}
// upsertDeploymentPromotion commits the given deployment promotion to Raft
func (w *Watcher) upsertDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error) {
2017-07-03 19:05:01 +00:00
return w.raft.UpdateDeploymentPromotion(req)
2017-06-26 21:23:52 +00:00
}
// upsertDeploymentAllocHealth commits the given allocation health changes to
// Raft
func (w *Watcher) upsertDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error) {
2017-07-03 19:05:01 +00:00
return w.raft.UpdateDeploymentAllocHealth(req)
2017-06-26 21:23:52 +00:00
}