open-nomad/nomad/state/state_store.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package state

import (
	"context"
	"errors"
	"fmt"
	"reflect"
	"sort"
	"strings"
	"time"

	"github.com/hashicorp/go-bexpr"
	"github.com/hashicorp/go-hclog"
	"github.com/hashicorp/go-memdb"
	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/go-set"
	"github.com/hashicorp/nomad/helper/pointer"
	"github.com/hashicorp/nomad/lib/lang"
	"github.com/hashicorp/nomad/nomad/stream"
	"github.com/hashicorp/nomad/nomad/structs"
	"golang.org/x/exp/slices"
)

// Txn is a transaction against a state store.
// This can be a read or write transaction.
//
// Txn is a type alias for the package-private *txn rather than a distinct
// type, so the two spellings are interchangeable.
type Txn = *txn

// SortOption represents how results can be sorted.
//
// The underlying type is bool, so the zero value of a SortOption is
// SortDefault.
type SortOption bool

const (
	// SortDefault indicates that the result should be returned using the
	// default go-memdb ResultIterator order.
	SortDefault SortOption = false

	// SortReverse indicates that the result should be returned using the
	// reversed go-memdb ResultIterator order.
	SortReverse SortOption = true
)

// Messages attached to the node events recorded by the state store.
const (
	// NodeEligibilityEventPlanRejectThreshold is the message used when the node
	// is set to ineligible due to multiple plan failures.
	// This is a preventive measure to signal scheduler workers to not consider
	// the node for future placements.
	// Plan rejections for a node are expected due to the optimistic and
	// concurrent nature of the scheduling process, but repeated failures for
	// the same node may indicate an underlying issue not detected by Nomad.
	// The plan applier keeps track of plan rejection history and will mark
	// nodes as ineligible if they cross a given threshold.
	//
	// UpsertPlanResults sets this message on the event it records for every
	// node listed in the plan results' IneligibleNodes.
	NodeEligibilityEventPlanRejectThreshold = "Node marked as ineligible for scheduling due to multiple plan rejections, refer to https://www.nomadproject.io/s/port-plan-failure for more information"

	// NodeRegisterEventRegistered is the message used when the node becomes
	// registered, i.e. when upsertNodeTxn inserts a node ID that is not yet in
	// the nodes table.
	NodeRegisterEventRegistered = "Node registered"

	// NodeRegisterEventReregistered is the message used when the node becomes
	// re-registered, i.e. when upsertNodeTxn sees a stored node in the down
	// status being upserted with any other status.
	NodeRegisterEventReregistered = "Node re-registered"
)

// terminate returns s followed by the go-memdb terminator character (a single
// NUL byte).
//
// The terminated string can be handed to a prefix scan over a compound index
// whose leading field is s, turning the prefix match on that field into an
// exact match.
func terminate(s string) string {
	terminated := make([]byte, 0, len(s)+1)
	terminated = append(terminated, s...)
	terminated = append(terminated, 0)
	return string(terminated)
}

// IndexEntry is used with the "index" table
// for managing the latest Raft index affecting a table.
type IndexEntry struct {
	// Key is the name of the table the entry tracks, e.g. "nodes",
	// "deployment" or "job_summary".
	Key string
	// Value is the index of the latest write recorded for that table.
	Value uint64
}

// StateStoreConfig is used to configure a new state store
type StateStoreConfig struct {
	// Logger is used to output the state store's logs. NewStateStore derives
	// a sub-logger named "state_store" from it, so it must not be nil.
	Logger hclog.Logger

	// Region is the region of the server embedding the state store.
	Region string

	// EnablePublisher is used to enable or disable the event publisher.
	// When false, NewStateStore creates no event broker and EventBroker
	// returns an error.
	EnablePublisher bool

	// EventBufferSize configures the amount of events to hold in memory.
	// It is passed to the event broker and therefore only consulted when
	// EnablePublisher is true.
	EventBufferSize int64
}

// The StateStore is responsible for maintaining all the Nomad
// state. It is manipulated by the FSM which maintains consistency
// through the use of Raft. The goals of the StateStore are to provide
// high concurrency for read operations without blocking writes, and
// to provide write availability in the face of reads. EVERY object
// returned as a result of a read against the state store should be
// considered a constant and NEVER modified in place.
type StateStore struct {
	// logger is the "state_store" sub-logger derived from the configured
	// logger in NewStateStore.
	logger hclog.Logger
	// db wraps the underlying MemDB (or, for stores created by Snapshot, a
	// MemDB snapshot) and hands out the read and write transactions.
	db     *changeTrackerDB

	// config is the passed in configuration
	config *StateStoreConfig

	// abandonCh is used to signal watchers that this state store has been
	// abandoned (usually during a restore). This is only ever closed.
	// It is left nil on the stores created by Snapshot.
	abandonCh chan struct{}

	// stopEventBroker cancels the context handed to the event broker in
	// NewStateStore. It is left nil on the stores created by Snapshot.
	//
	// TODO: refactor abandonCh to use a context so that both can use the same
	// cancel mechanism.
	stopEventBroker func()
}

// streamACLDelegate adapts a StateStore for the event broker: NewStateStore
// passes one to stream.NewEventBroker, which uses its TokenProvider method.
type streamACLDelegate struct {
	// s is the state store whose snapshots are served as token providers.
	s *StateStore
}

// TokenProvider returns a fresh point-in-time snapshot of the delegate's state
// store as the stream.ACLTokenProvider.
func (a *streamACLDelegate) TokenProvider() stream.ACLTokenProvider {
	// The error is ignored; Snapshot in this file always returns a nil error.
	snap, _ := a.s.Snapshot()
	return snap
}

// NewStateStore is used to create a new state store.
//
// It builds the MemDB from stateStoreSchema, optionally attaches an event
// broker (when config.EnablePublisher is set) and inserts the default
// namespace. config and config.Logger must be non-nil.
//
// The context handed to the event broker is cancelled by StopEventBroker (and
// therefore by Abandon). If construction fails after that context has been
// created, it is cancelled here so that nothing tied to it outlives the
// discarded store.
func NewStateStore(config *StateStoreConfig) (*StateStore, error) {
	// Create the MemDB
	db, err := memdb.NewMemDB(stateStoreSchema())
	if err != nil {
		return nil, fmt.Errorf("state store setup failed: %v", err)
	}

	// Create the state store
	ctx, cancel := context.WithCancel(context.TODO())
	s := &StateStore{
		logger:          config.Logger.Named("state_store"),
		config:          config,
		abandonCh:       make(chan struct{}),
		stopEventBroker: cancel,
	}

	if config.EnablePublisher {
		// Create new event publisher using provided config
		broker, err := stream.NewEventBroker(ctx, &streamACLDelegate{s}, stream.EventBrokerCfg{
			EventBufferSize: config.EventBufferSize,
			Logger:          config.Logger,
		})
		if err != nil {
			// Nobody will ever call StopEventBroker on s, so release the
			// context now.
			cancel()
			return nil, fmt.Errorf("creating state store event broker %w", err)
		}
		s.db = NewChangeTrackerDB(db, broker, eventsFromChanges)
	} else {
		s.db = NewChangeTrackerDB(db, nil, noOpProcessChanges)
	}

	// Initialize the state store with the default namespace.
	if err := s.namespaceInit(); err != nil {
		// The store is discarded; stop the broker (if any) with it.
		cancel()
		return nil, fmt.Errorf("enterprise state store initialization failed: %v", err)
	}

	return s, nil
}

// NewWatchSet builds a memdb.WatchSet that already contains this state store's
// abandon channel. Watchers using the set are therefore woken up when this
// specific store stops being valid, which usually means a new snapshot was
// loaded.
func (s *StateStore) NewWatchSet() memdb.WatchSet {
	watches := memdb.NewWatchSet()
	abandoned := s.AbandonCh()
	watches.Add(abandoned)
	return watches
}

// EventBroker returns the event broker attached to the state store's database,
// or an error when the store has none (for example because it was created with
// the publisher disabled, or is a snapshot).
func (s *StateStore) EventBroker() (*stream.EventBroker, error) {
	if broker := s.db.publisher; broker != nil {
		return broker, nil
	}
	return nil, fmt.Errorf("EventBroker not configured")
}

// namespaceInit ensures the default namespace exists by upserting it at
// index 1.
//
// Doing this on every state store creation is safe. On a brand new cluster
// every server ends up with the same default namespace object; on a cluster
// with existing state, a default namespace that has been modified is put back
// by the restore code path.
func (s *StateStore) namespaceInit() error {
	namespaces := []*structs.Namespace{{
		Name:        structs.DefaultNamespace,
		Description: structs.DefaultNamespaceDescription,
	}}

	err := s.UpsertNamespaces(1, namespaces)
	if err == nil {
		return nil
	}
	return fmt.Errorf("inserting default namespace failed: %v", err)
}

// Config exposes the configuration the state store was created with. The
// returned pointer is the store's own config, not a copy.
func (s *StateStore) Config() *StateStoreConfig {
	cfg := s.config
	return cfg
}

// Snapshot creates a point-in-time view of the state store. Since the store is
// backed by MemDB, this only requires snapshotting the underlying database.
//
// The snapshot shares the logger and config of s, but has no abandon channel
// and no event broker. The returned error is always nil.
func (s *StateStore) Snapshot() (*StateSnapshot, error) {
	// The snapshot's change tracker neither publishes nor tracks changes.
	snapDB := NewChangeTrackerDB(s.db.memdb.Snapshot(), nil, noOpProcessChanges)

	return &StateSnapshot{
		StateStore: StateStore{
			logger: s.logger,
			config: s.config,
			db:     snapDB,
		},
	}, nil
}

// SnapshotMinIndex is used to create a state snapshot where the index is
// guaranteed to be greater than or equal to the index parameter.
//
// Some server operations (such as scheduling) exchange objects via RPC
// concurrent with Raft log application, so they must ensure the state store
// snapshot they are operating on is at or after the index the objects
// retrieved via RPC were applied to the Raft log at.
//
// The method polls LatestIndex, sleeping between attempts with an exponential
// backoff that starts at 20ms and is capped at 1s. It returns the context's
// error if ctx is done before the index is reached, and a wrapped error if the
// latest index cannot be determined.
//
// Callers should maintain their own timer metric as the time this method
// blocks indicates Raft log application latency relative to scheduling.
func (s *StateStore) SnapshotMinIndex(ctx context.Context, index uint64) (*StateSnapshot, error) {
	// Ported from work.go:waitForIndex prior to 0.9

	const backoffBase = 20 * time.Millisecond
	const backoffLimit = 1 * time.Second

	// maxBackoffShift bounds the shift applied to backoffBase. Without a
	// bound the shifted duration overflows int64 after a few dozen retries
	// and eventually becomes 0, which would bypass the backoffLimit check
	// and turn the loop below into a busy loop. backoffBase<<30 still fits
	// comfortably in a time.Duration.
	const maxBackoffShift = 30

	var retries uint
	var retryTimer *time.Timer

	// Release the timer on every return path.
	defer func() {
		if retryTimer != nil {
			retryTimer.Stop()
		}
	}()

	// XXX: Potential optimization is to set up a watch on the state
	// store's index table and only unblock via a trigger rather than
	// polling.
	for {
		// Get the states current index
		snapshotIndex, err := s.LatestIndex()
		if err != nil {
			return nil, fmt.Errorf("failed to determine state store's index: %w", err)
		}

		// We only need the FSM state to be as recent as the given index
		if snapshotIndex >= index {
			return s.Snapshot()
		}

		// Exponential back off
		retries++
		if retryTimer == nil {
			// First retry, start at baseline
			retryTimer = time.NewTimer(backoffBase)
		} else {
			// Subsequent retry: wait backoffBase * 4^retries, capped at
			// backoffLimit. Once the shift is too large to compute safely
			// the cap is used directly.
			deadline := backoffLimit
			if shift := 2 * retries; shift <= maxBackoffShift {
				if d := backoffBase << shift; d < backoffLimit {
					deadline = d
				}
			}
			// The timer's channel was drained by the select below, so it
			// is safe to reset.
			retryTimer.Reset(deadline)
		}

		select {
		case <-ctx.Done():
			return nil, ctx.Err()
		case <-retryTimer.C:
		}
	}
}

// Restore opens a dedicated restore write transaction on the database and
// wraps it in a StateRestore. Rebuilding state through that single
// transaction minimizes the number of transactions and the checking overhead.
// The returned error is always nil.
func (s *StateStore) Restore() (*StateRestore, error) {
	return &StateRestore{txn: s.db.WriteTxnRestore()}, nil
}

// AbandonCh returns the receive-only side of the channel that is closed when
// the state store is abandoned. Wait on it to learn that the store is no
// longer valid.
func (s *StateStore) AbandonCh() <-chan struct{} {
	var abandoned <-chan struct{} = s.abandonCh
	return abandoned
}

// Abandon marks the state store as abandoned: it first stops the event broker
// and then closes the abandon channel, waking every watcher of AbandonCh.
//
// The channel is closed unconditionally, so calling Abandon more than once
// panics.
func (s *StateStore) Abandon() {
	// Stop publishing before telling watchers the store is gone.
	s.StopEventBroker()

	close(s.abandonCh)
}

// StopEventBroker calls the cancel func for the state stores event
// publisher. It should be called during server shutdown.
//
// Stores that were not built by NewStateStore (such as the ones created by
// Snapshot) carry no cancel func; for those the call is a no-op instead of a
// nil function call panic.
func (s *StateStore) StopEventBroker() {
	if s.stopEventBroker == nil {
		return
	}
	s.stopEventBroker()
}

// QueryFn is the definition of a function that can be used to implement a basic
// blocking query against the state store.
//
// BlockingQuery calls it with the watch set to register watches on (nil when
// the query is not blocking) and a snapshot of the state store. It returns the
// response, the index that response corresponds to, and any error.
type QueryFn func(memdb.WatchSet, *StateStore) (resp interface{}, index uint64, err error)

// BlockingQuery runs query against fresh snapshots of the state store until
// the index it reports is greater than minIndex, or until ctx is cancelled.
//
// A minIndex of 0 makes the call non-blocking: query runs exactly once, with a
// nil watch set. On error the response is nil and the error is returned along
// with the last index reported by query.
func (s *StateStore) BlockingQuery(query QueryFn, minIndex uint64, ctx context.Context) (
	resp interface{}, index uint64, err error) {

	blocking := minIndex > 0

	for {
		// Capture the abandon channel of the live store, but hand the query
		// function a snapshot. Working on a snapshot lets the query make
		// several separate calls to the store without wrapping them all in
		// one transaction.
		abandonCh := s.AbandonCh()
		snap, _ := s.Snapshot()

		// Watch tracking is only needed for blocking queries.
		var ws memdb.WatchSet
		if blocking {
			ws = memdb.NewWatchSet()

			// Closed when a snapshot is restored and the whole state store
			// is abandoned.
			ws.Add(abandonCh)
		}

		resp, index, err = query(ws, &snap.StateStore)
		if err != nil {
			return nil, index, err
		}

		// Done unless this is a blocking query that has not passed the
		// min-index yet.
		if !blocking || index > minIndex {
			return resp, index, nil
		}

		// Wait for a watched channel to fire (or ctx to end), then re-run.
		if watchErr := ws.WatchCtx(ctx); watchErr != nil {
			return nil, index, watchErr
		}
	}
}

// UpsertPlanResults is used to upsert the results of a plan.
//
// The stopped and preempted allocation diffs (and the legacy NodePreemptions)
// are first denormalized against a snapshot of the store. Then, inside a
// single write transaction at index, it:
//
//   - marks every node in results.IneligibleNodes as ineligible for scheduling,
//   - upserts results.Deployment and applies results.DeploymentUpdates,
//   - bumps the modify index of the evaluation results.EvalID,
//   - upserts all resulting allocations, and
//   - upserts the follow-up evaluations for preempted allocations.
//
// Any error aborts the transaction and is returned, with one exception: a
// failure while applying results.DeploymentUpdates is logged but does not fail
// the plan (this function has never propagated that error; it used to be
// discarded silently).
func (s *StateStore) UpsertPlanResults(msgType structs.MessageType, index uint64, results *structs.ApplyPlanResultsRequest) error {
	snapshot, err := s.Snapshot()
	if err != nil {
		return err
	}

	allocsStopped, err := snapshot.DenormalizeAllocationDiffSlice(results.AllocsStopped)
	if err != nil {
		return err
	}

	allocsPreempted, err := snapshot.DenormalizeAllocationDiffSlice(results.AllocsPreempted)
	if err != nil {
		return err
	}

	// COMPAT 0.11: Remove this denormalization when NodePreemptions is removed
	results.NodePreemptions, err = snapshot.DenormalizeAllocationSlice(results.NodePreemptions)
	if err != nil {
		return err
	}

	txn := s.db.WriteTxnMsgT(msgType, index)
	defer txn.Abort()

	// Mark nodes as ineligible.
	for _, nodeID := range results.IneligibleNodes {
		s.logger.Warn("marking node as ineligible due to multiple plan rejections, refer to https://www.nomadproject.io/s/port-plan-failure for more information", "node_id", nodeID)

		nodeEvent := structs.NewNodeEvent().
			SetSubsystem(structs.NodeEventSubsystemScheduler).
			SetMessage(NodeEligibilityEventPlanRejectThreshold)

		err := s.updateNodeEligibilityImpl(index, nodeID,
			structs.NodeSchedulingIneligible, results.UpdatedAt, nodeEvent, txn)
		if err != nil {
			return err
		}
	}

	// Upsert the newly created or updated deployment
	if results.Deployment != nil {
		if err := s.upsertDeploymentImpl(index, results.Deployment, txn); err != nil {
			return err
		}
	}

	// Update the status of deployments affected by the plan. A failure here
	// does not abort the plan, but it must not go unnoticed either.
	if len(results.DeploymentUpdates) != 0 {
		if err := s.upsertDeploymentUpdates(index, results.DeploymentUpdates, txn); err != nil {
			s.logger.Error("failed to apply deployment status updates from plan",
				"eval_id", results.EvalID, "error", err)
		}
	}

	if results.EvalID != "" {
		// Update the modify index of the eval id
		if err := s.updateEvalModifyIndex(txn, index, results.EvalID); err != nil {
			return err
		}
	}

	numAllocs := 0
	if len(results.Alloc) > 0 || len(results.NodePreemptions) > 0 {
		// COMPAT 0.11: This branch will be removed, when Alloc is removed
		// Attach the job to all the allocations. It is pulled out in the payload to
		// avoid the redundancy of encoding, but should be denormalized prior to
		// being inserted into MemDB.
		addComputedAllocAttrs(results.Alloc, results.Job)
		numAllocs = len(results.Alloc) + len(results.NodePreemptions)
	} else {
		// Attach the job to all the allocations. It is pulled out in the payload to
		// avoid the redundancy of encoding, but should be denormalized prior to
		// being inserted into MemDB.
		addComputedAllocAttrs(results.AllocsUpdated, results.Job)
		numAllocs = len(allocsStopped) + len(results.AllocsUpdated) + len(allocsPreempted)
	}

	// numAllocs is only a capacity hint for the slice below.
	allocsToUpsert := make([]*structs.Allocation, 0, numAllocs)

	// COMPAT 0.11: Both these appends should be removed when Alloc and NodePreemptions are removed
	allocsToUpsert = append(allocsToUpsert, results.Alloc...)
	allocsToUpsert = append(allocsToUpsert, results.NodePreemptions...)

	allocsToUpsert = append(allocsToUpsert, allocsStopped...)
	allocsToUpsert = append(allocsToUpsert, results.AllocsUpdated...)
	allocsToUpsert = append(allocsToUpsert, allocsPreempted...)

	// handle upgrade path
	for _, alloc := range allocsToUpsert {
		alloc.Canonicalize()
	}

	if err := s.upsertAllocsImpl(index, allocsToUpsert, txn); err != nil {
		return err
	}

	// Upsert followup evals for allocs that were preempted
	for _, eval := range results.PreemptionEvals {
		if err := s.nestedUpsertEval(txn, index, eval); err != nil {
			return err
		}
	}

	return txn.Commit()
}

// addComputedAllocAttrs fills in the computed/derived attributes of the given
// allocations. It is used while an allocation is being denormalized.
//
// The job is attached to the allocations through
// structs.DenormalizeAllocationJobs. In addition, every allocation that has no
// Resources gets them computed as the sum of its task resources and its shared
// resources.
func addComputedAllocAttrs(allocs []*structs.Allocation, job *structs.Job) {
	structs.DenormalizeAllocationJobs(job, allocs)

	// COMPAT(0.11): Remove in 0.11
	// The total resources are left out of the payload because they can be
	// computed, but they have to be denormalized before the allocation is
	// inserted into MemDB.
	for _, alloc := range allocs {
		if alloc.Resources == nil {
			total := new(structs.Resources)
			alloc.Resources = total

			for _, taskResources := range alloc.TaskResources {
				total.Add(taskResources)
			}

			// Shared resources count towards the total as well.
			total.Add(alloc.SharedResources)
		}
	}
}

// upsertDeploymentUpdates applies the given deployment status updates, in
// order, within txn. It stops at and returns the first error; updates applied
// before the failure stay in the transaction.
func (s *StateStore) upsertDeploymentUpdates(index uint64, updates []*structs.DeploymentStatusUpdate, txn *txn) error {
	for i := range updates {
		err := s.updateDeploymentStatusImpl(index, updates[i], txn)
		if err != nil {
			return err
		}
	}

	return nil
}

// UpsertJobSummary inserts or replaces a job summary at the given index.
//
// The passed summary has its indexes set in place: CreateIndex is carried over
// from an existing summary of the same namespace and job ID (or set to index
// when there is none), and ModifyIndex is always set to index.
func (s *StateStore) UpsertJobSummary(index uint64, jobSummary *structs.JobSummary) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	// Look for a summary that is already stored for this job.
	prior, err := txn.First("job_summary", "id", jobSummary.Namespace, jobSummary.JobID)
	if err != nil {
		return fmt.Errorf("job summary lookup failed: %v", err)
	}

	// Work out the indexes before touching the summary.
	createIndex := index
	if prior != nil {
		createIndex = prior.(*structs.JobSummary).CreateIndex
	}
	jobSummary.CreateIndex = createIndex
	jobSummary.ModifyIndex = index

	// Store the summary.
	if err := txn.Insert("job_summary", jobSummary); err != nil {
		return err
	}

	// Record the write in the index table.
	entry := &IndexEntry{"job_summary", index}
	if err := txn.Insert("index", entry); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return txn.Commit()
}

// DeleteJobSummary removes the job summary stored for the given namespace and
// job ID and records index for the job_summary table. It is meant for testing
// purposes only.
func (s *StateStore) DeleteJobSummary(index uint64, namespace, id string) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	// Remove the summary; the number of deleted objects is not of interest.
	_, err := txn.DeleteAll("job_summary", "id", namespace, id)
	if err != nil {
		return fmt.Errorf("deleting job summary failed: %v", err)
	}

	// Record the write in the index table.
	err = txn.Insert("index", &IndexEntry{"job_summary", index})
	if err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return txn.Commit()
}

// UpsertDeployment inserts a new deployment or updates an existing one, in its
// own write transaction at the given index.
func (s *StateStore) UpsertDeployment(index uint64, deployment *structs.Deployment) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	err := s.upsertDeploymentImpl(index, deployment, txn)
	if err != nil {
		return err
	}

	return txn.Commit()
}

// upsertDeploymentImpl inserts or replaces the deployment within txn without
// committing it.
//
// The deployment's indexes are set in place: CreateIndex is carried over from
// an already stored deployment with the same ID (or set to index when there is
// none) and ModifyIndex is set to index. When the deployment's status is
// successful, the job version it belongs to is additionally marked as stable.
func (s *StateStore) upsertDeploymentImpl(index uint64, deployment *structs.Deployment, txn *txn) error {
	// Look for a deployment that is already stored under this ID.
	prior, err := txn.First("deployment", "id", deployment.ID)
	if err != nil {
		return fmt.Errorf("deployment lookup failed: %v", err)
	}

	// Work out the indexes before touching the deployment.
	createIndex := index
	if prior != nil {
		createIndex = prior.(*structs.Deployment).CreateIndex
	}
	deployment.CreateIndex = createIndex
	deployment.ModifyIndex = index

	// Store the deployment.
	if err := txn.Insert("deployment", deployment); err != nil {
		return err
	}

	// Record the write in the index table.
	if err := txn.Insert("index", &IndexEntry{"deployment", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	// Nothing more to do unless the deployment completed successfully.
	if deployment.Status != structs.DeploymentStatusSuccessful {
		return nil
	}

	// A successful deployment makes its job version stable.
	err = s.updateJobStabilityImpl(index, deployment.Namespace, deployment.JobID, deployment.JobVersion, true, txn)
	if err != nil {
		return fmt.Errorf("failed to update job stability: %v", err)
	}

	return nil
}

// Deployments returns an iterator over all deployments, read through the
// "create" index in either the default or the reversed iterator order,
// depending on sort. The iterator's watch channel is added to ws.
func (s *StateStore) Deployments(ws memdb.WatchSet, sort SortOption) (memdb.ResultIterator, error) {
	txn := s.db.ReadTxn()

	var (
		iter memdb.ResultIterator
		err  error
	)
	if sort == SortReverse {
		iter, err = txn.GetReverse("deployment", "create")
	} else {
		iter, err = txn.Get("deployment", "create")
	}
	if err != nil {
		return nil, err
	}

	ws.Add(iter.WatchCh())
	return iter, nil
}

// DeploymentsByNamespace returns an iterator over the deployments found in the
// "namespace" index for the given namespace. The iterator's watch channel is
// added to ws.
func (s *StateStore) DeploymentsByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) {
	// Query the deployments table through its namespace index.
	it, err := s.db.ReadTxn().Get("deployment", "namespace", namespace)
	if err != nil {
		return nil, err
	}

	watchCh := it.WatchCh()
	ws.Add(watchCh)

	return it, nil
}

// DeploymentsByNamespaceOrdered returns an iterator over the deployments of a
// namespace, read through the "namespace_create_prefix" index in either the
// default or the reversed iterator order, depending on sort.
//
// The namespace is passed through terminate before the prefix scan so that
// only an exact namespace match is returned. The iterator's watch channel is
// added to ws.
func (s *StateStore) DeploymentsByNamespaceOrdered(ws memdb.WatchSet, namespace string, sort SortOption) (memdb.ResultIterator, error) {
	txn := s.db.ReadTxn()
	prefix := terminate(namespace)

	var iter memdb.ResultIterator
	var err error
	if sort == SortReverse {
		iter, err = txn.GetReverse("deployment", "namespace_create_prefix", prefix)
	} else {
		iter, err = txn.Get("deployment", "namespace_create_prefix", prefix)
	}
	if err != nil {
		return nil, err
	}

	ws.Add(iter.WatchCh())
	return iter, nil
}

// DeploymentsByIDPrefix returns an iterator over the deployments whose ID
// starts with deploymentID, in either the default or the reversed iterator
// order, depending on sort.
//
// The ID prefix scan itself is not scoped to a namespace; the result is
// wrapped in a filter iterator built from deploymentNamespaceFilter(namespace).
// The watch channel of the unfiltered iterator is added to ws.
func (s *StateStore) DeploymentsByIDPrefix(ws memdb.WatchSet, namespace, deploymentID string, sort SortOption) (memdb.ResultIterator, error) {
	txn := s.db.ReadTxn()

	// Scan the deployments table through its ID prefix index.
	var (
		unfiltered memdb.ResultIterator
		err        error
	)
	if sort == SortReverse {
		unfiltered, err = txn.GetReverse("deployment", "id_prefix", deploymentID)
	} else {
		unfiltered, err = txn.Get("deployment", "id_prefix", deploymentID)
	}
	if err != nil {
		return nil, err
	}

	ws.Add(unfiltered.WatchCh())

	// Apply the namespace filter on top of the prefix scan.
	return memdb.NewFilterIterator(unfiltered, deploymentNamespaceFilter(namespace)), nil
}

// deploymentNamespaceFilter builds a filter function that filters out every
// deployment that is not in the given namespace.
//
// The returned function reports true (filter out) for values that are not a
// *structs.Deployment and for deployments of another namespace. When namespace
// is structs.AllNamespacesSentinel, no deployment is filtered out.
func deploymentNamespaceFilter(namespace string) func(interface{}) bool {
	return func(raw interface{}) bool {
		deployment, isDeployment := raw.(*structs.Deployment)

		switch {
		case !isDeployment:
			return true
		case namespace == structs.AllNamespacesSentinel:
			return false
		default:
			return deployment.Namespace != namespace
		}
	}
}

// DeploymentByID looks up a single deployment by its ID in a new read
// transaction. It returns nil without an error when no such deployment exists.
func (s *StateStore) DeploymentByID(ws memdb.WatchSet, deploymentID string) (*structs.Deployment, error) {
	return s.deploymentByIDImpl(ws, deploymentID, s.db.ReadTxn())
}

// deploymentByIDImpl looks up a single deployment by its ID using the given
// transaction and adds the lookup's watch channel to ws. It returns nil
// without an error when no such deployment exists.
func (s *StateStore) deploymentByIDImpl(ws memdb.WatchSet, deploymentID string, txn *txn) (*structs.Deployment, error) {
	watchCh, raw, err := txn.FirstWatch("deployment", "id", deploymentID)
	if err != nil {
		return nil, fmt.Errorf("deployment lookup failed: %v", err)
	}
	ws.Add(watchCh)

	if raw == nil {
		return nil, nil
	}
	return raw.(*structs.Deployment), nil
}

// DeploymentsByJobID returns the deployments recorded for the given namespace
// and job ID.
//
// When all is false and the job currently exists, only deployments whose
// JobCreateIndex equals the job's CreateIndex are returned, which leaves out
// deployments of an earlier job registered under the same ID. When all is true
// or the job no longer exists, every matching deployment is returned.
//
// Only the deployments iterator is watched through ws; the job lookup is not.
func (s *StateStore) DeploymentsByJobID(ws memdb.WatchSet, namespace, jobID string, all bool) ([]*structs.Deployment, error) {
	txn := s.db.ReadTxn()

	// Read the job from the state store; its watch channel is not used.
	_, rawJob, err := txn.FirstWatch("jobs", "id", namespace, jobID)
	if err != nil {
		return nil, fmt.Errorf("job lookup failed: %v", err)
	}
	var job *structs.Job
	if rawJob != nil {
		job = rawJob.(*structs.Job)
	}

	// Iterate over the deployments of the job ID.
	iter, err := txn.Get("deployment", "job", namespace, jobID)
	if err != nil {
		return nil, err
	}
	ws.Add(iter.WatchCh())

	var out []*structs.Deployment
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		d := raw.(*structs.Deployment)

		// Keep the deployment when everything was asked for, when there is
		// no job to compare against, or when it belongs to the current job
		// (same create index) rather than to an older job with the same ID.
		if all || job == nil || d.JobCreateIndex == job.CreateIndex {
			out = append(out, d)
		}
	}

	return out, nil
}

// LatestDeploymentByJobID returns the most recent deployment of the given job,
// or nil when the job has no deployments. "Most recent" is decided strictly by
// CreateIndex; among deployments with equal CreateIndex the first one
// iterated wins. The iterator's watch channel is added to ws.
func (s *StateStore) LatestDeploymentByJobID(ws memdb.WatchSet, namespace, jobID string) (*structs.Deployment, error) {
	// Iterate over the deployments of the job ID.
	iter, err := s.db.ReadTxn().Get("deployment", "job", namespace, jobID)
	if err != nil {
		return nil, err
	}
	ws.Add(iter.WatchCh())

	var latest *structs.Deployment
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		candidate := raw.(*structs.Deployment)
		if latest == nil || candidate.CreateIndex > latest.CreateIndex {
			latest = candidate
		}
	}

	return latest, nil
}

// DeleteDeployment deletes a set of deployments by ID in a single write
// transaction.
//
// An empty ID list is a no-op that returns nil. If any of the IDs does not
// exist the call fails and, because the transaction is aborted, none of the
// deployments are deleted.
func (s *StateStore) DeleteDeployment(index uint64, deploymentIDs []string) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	if len(deploymentIDs) == 0 {
		return nil
	}

	for _, id := range deploymentIDs {
		// The deployment has to exist before it can be deleted.
		found, err := txn.First("deployment", "id", id)
		switch {
		case err != nil:
			return fmt.Errorf("deployment lookup failed: %v", err)
		case found == nil:
			return fmt.Errorf("deployment not found")
		}

		if err := txn.Delete("deployment", found); err != nil {
			return fmt.Errorf("deployment delete failed: %v", err)
		}
	}

	// Record the write in the index table.
	entry := &IndexEntry{"deployment", index}
	if err := txn.Insert("index", entry); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return txn.Commit()
}

// UpsertScalingEvent is used to insert a new scaling event.
// Only the most recent JobTrackedScalingEvents will be kept.
//
// The event in req is stamped with index as its CreateIndex and prepended to
// the events tracked for req.TaskGroup of the job req.Namespace/req.JobID; the
// job's scaling events record gets index as its ModifyIndex.
//
// Objects read from the state store must never be modified in place, since
// they are shared with snapshots and concurrent readers. An existing record is
// therefore copied (including its task group map) before it is updated.
func (s *StateStore) UpsertScalingEvent(index uint64, req *structs.ScalingEventRequest) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	// Get the existing events
	existing, err := txn.First("scaling_event", "id", req.Namespace, req.JobID)
	if err != nil {
		return fmt.Errorf("scaling event lookup failed: %v", err)
	}

	var jobEvents *structs.JobScalingEvents
	if existing != nil {
		// Work on a copy so the stored object stays untouched. The
		// per-task-group slices can be shared: they are only ever replaced
		// by freshly built slices below, never written to.
		stored := existing.(*structs.JobScalingEvents)
		updated := *stored
		updated.ScalingEvents = make(map[string][]*structs.ScalingEvent, len(stored.ScalingEvents)+1)
		for taskGroup, groupEvents := range stored.ScalingEvents {
			updated.ScalingEvents[taskGroup] = groupEvents
		}
		jobEvents = &updated
	} else {
		jobEvents = &structs.JobScalingEvents{
			Namespace:     req.Namespace,
			JobID:         req.JobID,
			ScalingEvents: make(map[string][]*structs.ScalingEvent),
		}
	}

	jobEvents.ModifyIndex = index
	req.ScalingEvent.CreateIndex = index

	events := jobEvents.ScalingEvents[req.TaskGroup]
	// Prepend this latest event. Appending to a fresh one-element slice
	// builds a new backing array instead of writing into the stored one.
	events = append(
		[]*structs.ScalingEvent{req.ScalingEvent},
		events...,
	)
	// Truncate older events
	if len(events) > structs.JobTrackedScalingEvents {
		events = events[0:structs.JobTrackedScalingEvents]
	}
	jobEvents.ScalingEvents[req.TaskGroup] = events

	// Insert the new event
	if err := txn.Insert("scaling_event", jobEvents); err != nil {
		return fmt.Errorf("scaling event insert failed: %v", err)
	}

	// Update the indexes table for scaling_event
	if err := txn.Insert("index", &IndexEntry{"scaling_event", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return txn.Commit()
}

// ScalingEvents returns an iterator over the scaling event records of all
// jobs. The iterator's watch channel is added to ws.
func (s *StateStore) ScalingEvents(ws memdb.WatchSet) (memdb.ResultIterator, error) {
	// An "id" query without arguments walks the whole scaling_event table.
	it, err := s.db.ReadTxn().Get("scaling_event", "id")
	if err != nil {
		return nil, err
	}

	watchCh := it.WatchCh()
	ws.Add(watchCh)

	return it, nil
}

// ScalingEventsByJob returns the scaling events tracked for a job, keyed by
// task group, together with the ModifyIndex of the job's scaling events
// record. When nothing is recorded for the job it returns a nil map and a zero
// index. The lookup's watch channel is added to ws.
func (s *StateStore) ScalingEventsByJob(ws memdb.WatchSet, namespace, jobID string) (map[string][]*structs.ScalingEvent, uint64, error) {
	watchCh, raw, err := s.db.ReadTxn().FirstWatch("scaling_event", "id", namespace, jobID)
	if err != nil {
		return nil, 0, fmt.Errorf("job scaling events lookup failed: %v", err)
	}
	ws.Add(watchCh)

	if raw == nil {
		return nil, 0, nil
	}

	jobEvents := raw.(*structs.JobScalingEvents)
	return jobEvents.ScalingEvents, jobEvents.ModifyIndex, nil
}

// UpsertNode is used to register a node or update a node definition
// This is assumed to be triggered by the client, so we retain the value
// of drain/eligibility which is set by the scheduler.
//
// The upsert runs in its own write transaction at index. If it fails, the
// transaction is aborted and the error is returned to the caller.
func (s *StateStore) UpsertNode(msgType structs.MessageType, index uint64, node *structs.Node) error {
	txn := s.db.WriteTxnMsgT(msgType, index)
	defer txn.Abort()

	// Propagate the failure: returning nil here would report success for a
	// node that was never written.
	if err := upsertNodeTxn(txn, index, node); err != nil {
		return err
	}
	return txn.Commit()
}

// upsertNodeTxn inserts or replaces node in the nodes table within txn,
// records index for the nodes table and updates the CSI plugins for the node.
// It does not commit the transaction.
//
// The passed node is modified in place. For a node ID that is already stored,
// the fields owned by the server side are carried over from the stored node
// (create index, events, scheduling eligibility, drain strategy and drain
// metadata), and the heartbeat and alloc-update indexes are never moved
// backwards. For a new node ID, the node starts out with a single
// "Node registered" event.
func upsertNodeTxn(txn *txn, index uint64, node *structs.Node) error {
	// Check if the node already exists
	existing, err := txn.First("nodes", "id", node.ID)
	if err != nil {
		return fmt.Errorf("node lookup failed: %v", err)
	}

	// Setup the indexes correctly
	if existing != nil {
		exist := existing.(*structs.Node)
		node.CreateIndex = exist.CreateIndex
		node.ModifyIndex = index

		// Update last missed heartbeat if the node became unresponsive.
		if !exist.UnresponsiveStatus() && node.UnresponsiveStatus() {
			node.LastMissedHeartbeatIndex = index
		}

		// Retain node events that have already been set on the node. This has
		// to happen before the re-registration event below is appended.
		node.Events = exist.Events

		// If we are transitioning from down, record the re-registration
		if exist.Status == structs.NodeStatusDown && node.Status != structs.NodeStatusDown {
			appendNodeEvents(index, node, []*structs.NodeEvent{
				structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster).
					SetMessage(NodeRegisterEventReregistered).
					SetTimestamp(time.Unix(node.StatusUpdatedAt, 0))})
		}

		node.SchedulingEligibility = exist.SchedulingEligibility // Retain the eligibility
		node.DrainStrategy = exist.DrainStrategy                 // Retain the drain strategy
		node.LastDrain = exist.LastDrain                         // Retain the drain metadata

		// Retain the last index the node missed a heartbeat.
		if node.LastMissedHeartbeatIndex < exist.LastMissedHeartbeatIndex {
			node.LastMissedHeartbeatIndex = exist.LastMissedHeartbeatIndex
		}

		// Retain the last index the node updated its allocs.
		if node.LastAllocUpdateIndex < exist.LastAllocUpdateIndex {
			node.LastAllocUpdateIndex = exist.LastAllocUpdateIndex
		}
	} else {
		// Because this is the first time the node is being registered, we should
		// also create a node registration event
		nodeEvent := structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster).
			SetMessage(NodeRegisterEventRegistered).
			SetTimestamp(time.Unix(node.StatusUpdatedAt, 0))
		node.Events = []*structs.NodeEvent{nodeEvent}
		node.CreateIndex = index
		node.ModifyIndex = index
	}

	// Insert the node
	if err := txn.Insert("nodes", node); err != nil {
		return fmt.Errorf("node insert failed: %v", err)
	}
	// Record the write in the index table.
	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}
	// Keep the CSI plugin records in step with the node.
	if err := upsertCSIPluginsForNode(txn, node, index); err != nil {
		return fmt.Errorf("csi plugin update failed: %v", err)
	}

	return nil
}

// DeleteNode deregisters a batch of nodes in a single write transaction at
// index.
//
// If the deletion fails (for example because the ID list is empty or one of
// the nodes does not exist), the transaction is aborted, no node is deleted
// and the error is returned to the caller.
//
// NOTE(review): msgType is accepted but not used; the transaction is opened
// with WriteTxn rather than WriteTxnMsgT. Confirm whether that is intended.
func (s *StateStore) DeleteNode(msgType structs.MessageType, index uint64, nodes []string) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	// Propagate the failure: returning nil here would report success for
	// nodes that were never deleted.
	if err := deleteNodeTxn(txn, index, nodes); err != nil {
		return err
	}
	return txn.Commit()
}

// deleteNodeTxn removes the given nodes, along with their CSI plugin records,
// within txn and records index for the nodes table. It does not commit the
// transaction.
//
// It fails when nodes is empty or when any of the node IDs is not stored.
func deleteNodeTxn(txn *txn, index uint64, nodes []string) error {
	if len(nodes) == 0 {
		return fmt.Errorf("node ids missing")
	}

	for _, id := range nodes {
		raw, err := txn.First("nodes", "id", id)
		switch {
		case err != nil:
			return fmt.Errorf("node lookup failed: %s: %v", id, err)
		case raw == nil:
			return fmt.Errorf("node not found: %s", id)
		}

		// Remove the node itself first.
		if err := txn.Delete("nodes", raw); err != nil {
			return fmt.Errorf("node delete failed: %s: %v", id, err)
		}

		// Then remove its CSI plugin records.
		if err := deleteNodeCSIPlugins(txn, raw.(*structs.Node), index); err != nil {
			return fmt.Errorf("csi plugin delete failed: %v", err)
		}
	}

	// Record the write in the index table.
	entry := &IndexEntry{"nodes", index}
	if err := txn.Insert("index", entry); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return nil
}

// UpdateNodeStatus sets the status of the node nodeID in its own write
// transaction at index. updatedAt is stored as the node's status update time,
// and event, when non-nil, is appended to the node's events.
func (s *StateStore) UpdateNodeStatus(msgType structs.MessageType, index uint64, nodeID, status string, updatedAt int64, event *structs.NodeEvent) error {
	txn := s.db.WriteTxnMsgT(msgType, index)
	defer txn.Abort()

	err := s.updateNodeStatusTxn(txn, nodeID, status, updatedAt, event)
	if err != nil {
		return err
	}

	return txn.Commit()
}

// updateNodeStatusTxn sets the status of the node nodeID within txn, using the
// transaction's index as the modify index. It does not commit the transaction.
//
// The stored node is left untouched; a copy is updated and inserted in its
// place. The copy's LastMissedHeartbeatIndex is set to the transaction index
// when the node turns unresponsive, and reset to zero when it turns ready.
// It fails when the node does not exist.
func (s *StateStore) updateNodeStatusTxn(txn *txn, nodeID, status string, updatedAt int64, event *structs.NodeEvent) error {
	// Find the node that is being updated.
	raw, err := txn.First("nodes", "id", nodeID)
	switch {
	case err != nil:
		return fmt.Errorf("node lookup failed: %v", err)
	case raw == nil:
		return fmt.Errorf("node not found")
	}

	// All changes go to a copy of the stored node.
	previous := raw.(*structs.Node)
	updated := previous.Copy()
	updated.StatusUpdatedAt = updatedAt

	// Record the event, if there is one.
	if event != nil {
		appendNodeEvents(txn.Index, updated, []*structs.NodeEvent{event})
	}

	updated.Status = status
	updated.ModifyIndex = txn.Index

	switch {
	case !previous.UnresponsiveStatus() && updated.UnresponsiveStatus():
		// The node just became unresponsive: remember when.
		updated.LastMissedHeartbeatIndex = txn.Index
	case previous.Status != structs.NodeStatusReady && updated.Status == structs.NodeStatusReady:
		// The node just became ready: forget the last missed heartbeat.
		updated.LastMissedHeartbeatIndex = 0
	}

	// Store the updated copy and record the write in the index table.
	if err := txn.Insert("nodes", updated); err != nil {
		return fmt.Errorf("node update failed: %v", err)
	}
	if err := txn.Insert("index", &IndexEntry{"nodes", txn.Index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return nil
}

// BatchUpdateNodeDrain updates the drain of a set of nodes in one write
// transaction. updates and events are both keyed by node ID. Every update is
// applied as a completed drain, without drain metadata or an accessor ID; the
// first failure aborts the whole batch.
func (s *StateStore) BatchUpdateNodeDrain(msgType structs.MessageType, index uint64, updatedAt int64,
	updates map[string]*structs.DrainUpdate, events map[string]*structs.NodeEvent) error {

	tx := s.db.WriteTxnMsgT(msgType, index)
	defer tx.Abort()

	for nodeID, drainUpdate := range updates {
		err := s.updateNodeDrainImpl(tx, index, nodeID,
			drainUpdate.DrainStrategy, drainUpdate.MarkEligible, updatedAt,
			events[nodeID], nil, "", true)
		if err != nil {
			return err
		}
	}

	return tx.Commit()
}

// UpdateNodeDrain updates the drain of a single node in its own write
// transaction. The update is never recorded as a completed drain; see
// BatchUpdateNodeDrain for that path.
func (s *StateStore) UpdateNodeDrain(msgType structs.MessageType, index uint64, nodeID string,
	drain *structs.DrainStrategy, markEligible bool, updatedAt int64,
	event *structs.NodeEvent, drainMeta map[string]string, accessorId string) error {

	tx := s.db.WriteTxnMsgT(msgType, index)
	defer tx.Abort()

	const drainCompleted = false
	err := s.updateNodeDrainImpl(tx, index, nodeID, drain, markEligible, updatedAt,
		event, drainMeta, accessorId, drainCompleted)
	if err != nil {
		return err
	}

	return tx.Commit()
}

// updateNodeDrainImpl applies a drain change to a single node within the
// given write transaction.
//
// It copies the stored node, appends the optional event, replaces the drain
// strategy, adjusts scheduling eligibility, maintains the LastDrain metadata
// and finally writes the node and the "nodes" table index at index.
//
// A non-nil drain always marks the node ineligible for scheduling; a nil
// drain marks it eligible only when markEligible is set. When the strategy is
// cleared, drainCompleted decides whether LastDrain is recorded as complete or
// canceled. On an existing LastDrain, drainMeta and accessorId only overwrite
// the stored values when they are non-empty.
//
// An error is returned if the node lookup fails, the node does not exist, or
// either insert fails.
func (s *StateStore) updateNodeDrainImpl(txn *txn, index uint64, nodeID string,
	drain *structs.DrainStrategy, markEligible bool, updatedAt int64,
	event *structs.NodeEvent, drainMeta map[string]string, accessorId string,
	drainCompleted bool) error {

	// Lookup the node
	existing, err := txn.First("nodes", "id", nodeID)
	if err != nil {
		return fmt.Errorf("node lookup failed: %v", err)
	}
	if existing == nil {
		return fmt.Errorf("node not found")
	}

	// Copy the existing node
	existingNode := existing.(*structs.Node)
	updatedNode := existingNode.Copy()
	updatedNode.StatusUpdatedAt = updatedAt

	// Add the event if given
	if event != nil {
		appendNodeEvents(index, updatedNode, []*structs.NodeEvent{event})
	}

	// Update the drain in the copy
	updatedNode.DrainStrategy = drain
	if drain != nil {
		updatedNode.SchedulingEligibility = structs.NodeSchedulingIneligible
	} else if markEligible {
		updatedNode.SchedulingEligibility = structs.NodeSchedulingEligible
	}

	// Update LastDrain. updatedAt is interpreted as whole seconds since the
	// Unix epoch.
	updateTime := time.Unix(updatedAt, 0)

	// if drain strategy isn't set before or after, this wasn't a drain operation
	// in that case, we don't care about .LastDrain
	drainNoop := existingNode.DrainStrategy == nil && updatedNode.DrainStrategy == nil
	// otherwise, when done with this method, updatedNode.LastDrain should be set
	// if starting a new drain operation, create a new LastDrain. otherwise, update the existing one.
	startedDraining := existingNode.DrainStrategy == nil && updatedNode.DrainStrategy != nil
	if !drainNoop {
		if startedDraining {
			updatedNode.LastDrain = &structs.DrainMetadata{
				StartedAt: updateTime,
				Meta:      drainMeta,
			}
		} else if updatedNode.LastDrain == nil {
			// if already draining and LastDrain doesn't exist, we need to create a new one
			// this could happen if we upgraded to 1.1.x during a drain
			//
			// existingNode.DrainStrategy is non-nil here: this branch is only
			// reached when the update is neither a no-op nor the start of a drain.
			updatedNode.LastDrain = &structs.DrainMetadata{
				// we don't have sub-second accuracy on these fields, so truncate this
				StartedAt: time.Unix(existingNode.DrainStrategy.StartedAt.Unix(), 0),
				Meta:      drainMeta,
			}
		}

		updatedNode.LastDrain.UpdatedAt = updateTime

		// won't have new metadata on drain complete; keep the existing operator-provided metadata
		// also, keep existing if they didn't provide it
		if len(drainMeta) != 0 {
			updatedNode.LastDrain.Meta = drainMeta
		}

		// we won't have an accessor ID on drain complete, so don't overwrite the existing one
		if accessorId != "" {
			updatedNode.LastDrain.AccessorID = accessorId
		}

		// Derive the drain status: still draining while a strategy is set,
		// otherwise complete or canceled depending on drainCompleted.
		if updatedNode.DrainStrategy != nil {
			updatedNode.LastDrain.Status = structs.DrainStatusDraining
		} else if drainCompleted {
			updatedNode.LastDrain.Status = structs.DrainStatusComplete
		} else {
			updatedNode.LastDrain.Status = structs.DrainStatusCanceled
		}
	}

	updatedNode.ModifyIndex = index

	// Insert the node
	if err := txn.Insert("nodes", updatedNode); err != nil {
		return fmt.Errorf("node update failed: %v", err)
	}
	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return nil
}

// UpdateNodeEligibility changes the scheduling eligibility of a node in its
// own write transaction, committing only if the update succeeds.
func (s *StateStore) UpdateNodeEligibility(msgType structs.MessageType, index uint64, nodeID string, eligibility string, updatedAt int64, event *structs.NodeEvent) error {
	tx := s.db.WriteTxnMsgT(msgType, index)
	defer tx.Abort()

	err := s.updateNodeEligibilityImpl(index, nodeID, eligibility, updatedAt, event, tx)
	if err != nil {
		return err
	}
	return tx.Commit()
}

// updateNodeEligibilityImpl sets the scheduling eligibility of a node inside
// the given write transaction. A node that has a drain strategy cannot be made
// eligible. An error is returned when the node lookup fails, the node does not
// exist, the change is not allowed, or an insert fails.
func (s *StateStore) updateNodeEligibilityImpl(index uint64, nodeID string, eligibility string, updatedAt int64, event *structs.NodeEvent, txn *txn) error {
	// Fetch the currently stored node.
	raw, err := txn.First("nodes", "id", nodeID)
	switch {
	case err != nil:
		return fmt.Errorf("node lookup failed: %v", err)
	case raw == nil:
		return fmt.Errorf("node not found")
	}

	// Work on a copy so the stored object is never mutated.
	updated := raw.(*structs.Node).Copy()
	updated.StatusUpdatedAt = updatedAt

	// Record the event when one was supplied. This intentionally happens
	// before the validity check below, as appendNodeEvents stamps the event.
	if event != nil {
		appendNodeEvents(index, updated, []*structs.NodeEvent{event})
	}

	// Reject marking a draining node as eligible.
	draining := updated.DrainStrategy != nil
	if draining && eligibility == structs.NodeSchedulingEligible {
		return fmt.Errorf("can not set node's scheduling eligibility to eligible while it is draining")
	}

	updated.SchedulingEligibility = eligibility
	updated.ModifyIndex = index

	// Persist the node and bump the table index.
	if err := txn.Insert("nodes", updated); err != nil {
		return fmt.Errorf("node update failed: %v", err)
	}
	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}
	return nil
}

// UpsertNodeEvents appends the given events to their nodes, keyed by node ID,
// rotating out older events as necessary. All nodes are updated in a single
// write transaction; the first failure aborts the whole batch.
func (s *StateStore) UpsertNodeEvents(msgType structs.MessageType, index uint64, nodeEvents map[string][]*structs.NodeEvent) error {
	tx := s.db.WriteTxnMsgT(msgType, index)
	defer tx.Abort()

	for id, evs := range nodeEvents {
		err := s.upsertNodeEvents(index, id, evs, tx)
		if err != nil {
			return err
		}
	}

	return tx.Commit()
}

// upsertNodeEvents appends events to a single node inside the given write
// transaction. Only a bounded number of events is retained per node; older
// ones are dropped once the bound is exceeded. The updated node and the
// "nodes" table index are written at index.
func (s *StateStore) upsertNodeEvents(index uint64, nodeID string, events []*structs.NodeEvent, txn *txn) error {
	// Fetch the currently stored node.
	raw, err := txn.First("nodes", "id", nodeID)
	switch {
	case err != nil:
		return fmt.Errorf("node lookup failed: %v", err)
	case raw == nil:
		return fmt.Errorf("node not found")
	}

	// Append to a copy so the stored object is never mutated.
	updated := raw.(*structs.Node).Copy()
	appendNodeEvents(index, updated, events)

	// Persist the node and bump the table index.
	if err := txn.Insert("nodes", updated); err != nil {
		return fmt.Errorf("node update failed: %v", err)
	}
	if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}
	return nil
}

// appendNodeEvents stamps every new event with index as its CreateIndex and
// appends it to the node's event list. If the list then exceeds
// structs.MaxRetainedNodeEvents, the oldest entries are dropped. Both the node
// and the passed events are modified in place.
func appendNodeEvents(index uint64, node *structs.Node, events []*structs.NodeEvent) {
	for i := range events {
		events[i].CreateIndex = index
		node.Events = append(node.Events, events[i])
	}

	// Trim from the front so only the most recent events remain.
	excess := len(node.Events) - structs.MaxRetainedNodeEvents
	if excess > 0 {
		node.Events = node.Events[excess:]
	}
}

// upsertCSIPluginsForNode indexes the CSI plugins listed on a node in the
// "csi_plugins" table, with health, for volume retrieval.
//
// For every controller and node plugin on the node it loads the stored plugin
// (or creates one, but only when the fingerprint is healthy), refreshes
// Provider and Version from the fingerprint and registers the node on it. It
// then walks all stored plugins and removes this node from any plugin it is
// not listed for anymore, deleting plugins that end up empty. The
// "csi_plugins" table index is always set to index.
//
// NOTE(review): the previous comment said this is called on upsertNodeEvents
// so that event driven health changes are updated, but upsertNodeEvents as
// visible in this file does not call it — confirm the actual call sites.
func upsertCSIPluginsForNode(txn *txn, node *structs.Node, index uint64) error {

	// upsertFn adds or refreshes this node's entry on the plugin described by
	// a single fingerprinted CSIInfo.
	upsertFn := func(info *structs.CSIInfo) error {
		raw, err := txn.First("csi_plugins", "id", info.PluginID)
		if err != nil {
			return fmt.Errorf("csi_plugin lookup error: %s %v", info.PluginID, err)
		}

		var plug *structs.CSIPlugin
		if raw != nil {
			// copy so the stored object is not mutated
			plug = raw.(*structs.CSIPlugin).Copy()
		} else {
			if !info.Healthy {
				// we don't want to create new plugins for unhealthy
				// allocs, otherwise we'd recreate the plugin when we
				// get the update for the alloc becoming terminal
				return nil
			}
			plug = structs.NewCSIPlugin(info.PluginID, index)
		}

		// the plugin may have been created by the job being updated, in which case
		// this data will not be configured, it's only available to the fingerprint
		// system
		plug.Provider = info.Provider
		plug.Version = info.ProviderVersion

		err = plug.AddPlugin(node.ID, info)
		if err != nil {
			return err
		}

		plug.ModifyIndex = index

		err = txn.Insert("csi_plugins", plug)
		if err != nil {
			return fmt.Errorf("csi_plugins insert error: %v", err)
		}

		return nil
	}

	// plugin IDs this node currently reports, per plugin type
	inUseController := map[string]struct{}{}
	inUseNode := map[string]struct{}{}

	for _, info := range node.CSIControllerPlugins {
		err := upsertFn(info)
		if err != nil {
			return err
		}
		inUseController[info.PluginID] = struct{}{}
	}

	for _, info := range node.CSINodePlugins {
		err := upsertFn(info)
		if err != nil {
			return err
		}
		inUseNode[info.PluginID] = struct{}{}
	}

	// remove the client node from any plugin that's not
	// running on it.
	iter, err := txn.Get("csi_plugins", "id")
	if err != nil {
		return fmt.Errorf("csi_plugins lookup failed: %v", err)
	}
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		plug, ok := raw.(*structs.CSIPlugin)
		if !ok {
			continue
		}
		plug = plug.Copy()

		var hadDelete bool
		if _, ok := inUseController[plug.ID]; !ok {
			if _, asController := plug.Controllers[node.ID]; asController {
				err := plug.DeleteNodeForType(node.ID, structs.CSIPluginTypeController)
				if err != nil {
					return err
				}
				hadDelete = true
			}
		}
		if _, ok := inUseNode[plug.ID]; !ok {
			if _, asNode := plug.Nodes[node.ID]; asNode {
				err := plug.DeleteNodeForType(node.ID, structs.CSIPluginTypeNode)
				if err != nil {
					return err
				}
				hadDelete = true
			}
		}
		// we check this flag both for performance and to make sure we
		// don't delete a plugin when registering a node plugin but
		// no controller
		if hadDelete {
			err = updateOrGCPlugin(index, txn, plug)
			if err != nil {
				return err
			}
		}
	}

	// the table index is bumped even when no plugin was changed
	if err := txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return nil
}

// deleteNodeCSIPlugins removes a node from every CSI plugin it lists, either
// as controller or as node plugin. Plugins that are empty afterwards are
// deleted, the others are updated at index. Nothing is written when the node
// lists no CSI plugins at all.
func deleteNodeCSIPlugins(txn *txn, node *structs.Node, index uint64) error {
	if len(node.CSIControllerPlugins) == 0 && len(node.CSINodePlugins) == 0 {
		return nil
	}

	// Collect the distinct plugin IDs referenced by the node.
	pluginIDs := map[string]struct{}{}
	for _, controllerInfo := range node.CSIControllerPlugins {
		pluginIDs[controllerInfo.PluginID] = struct{}{}
	}
	for _, nodeInfo := range node.CSINodePlugins {
		pluginIDs[nodeInfo.PluginID] = struct{}{}
	}

	for pluginID := range pluginIDs {
		raw, err := txn.First("csi_plugins", "id", pluginID)
		if err != nil {
			return fmt.Errorf("csi_plugins lookup error %s: %v", pluginID, err)
		}

		// The plugin may already have been deregistered while the node's
		// fingerprint still mentions it.
		if raw == nil {
			continue
		}

		plug := raw.(*structs.CSIPlugin).Copy()
		if err := plug.DeleteNode(node.ID); err != nil {
			return err
		}
		if err := updateOrGCPlugin(index, txn, plug); err != nil {
			return err
		}
	}

	if err := txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}
	return nil
}

// updateOrGCPlugin writes the plugin back to the "csi_plugins" table with its
// ModifyIndex set to index, unless the plugin is empty, in which case it is
// deleted from the table instead.
func updateOrGCPlugin(index uint64, txn Txn, plug *structs.CSIPlugin) error {
	if !plug.IsEmpty() {
		plug.ModifyIndex = index
		if err := txn.Insert("csi_plugins", plug); err != nil {
			return fmt.Errorf("csi_plugins update error %s: %v", plug.ID, err)
		}
		return nil
	}

	// Nothing references the plugin anymore: garbage collect it.
	if err := txn.Delete("csi_plugins", plug); err != nil {
		return fmt.Errorf("csi_plugins delete error: %v", err)
	}
	return nil
}

// deleteJobFromPlugins removes the allocations of this job from any plugins the job is
// running, possibly deleting the plugin if it's no longer in use. It's called in DeleteJobTxn.
//
// Plugin IDs are gathered from the tasks of the job's allocations and, for
// task groups without any allocation, from the job definition itself. Each
// affected plugin is copied once, has the job's allocations and the job
// removed, and is then updated or garbage collected. The "csi_plugins" table
// index is only bumped when at least one plugin was touched.
//
// NOTE(review): the job summary and the allocations are read through
// s.JobSummaryByID and s.AllocsByJob rather than through the txn argument —
// confirm that reading outside the passed transaction is intended.
func (s *StateStore) deleteJobFromPlugins(index uint64, txn Txn, job *structs.Job) error {
	ws := memdb.NewWatchSet()
	summary, err := s.JobSummaryByID(ws, job.Namespace, job.ID)
	if err != nil {
		return fmt.Errorf("error getting job summary: %v", err)
	}

	allocs, err := s.AllocsByJob(ws, job.Namespace, job.ID, false)
	if err != nil {
		return fmt.Errorf("error getting allocations: %v", err)
	}

	// pair ties a plugin ID to the allocation running it; alloc is nil when
	// the plugin comes from a task group that has no allocation.
	type pair struct {
		pluginID string
		alloc    *structs.Allocation
	}

	plugAllocs := []*pair{}
	// names of the task groups that have at least one allocation
	found := map[string]struct{}{}

	// Find plugins for allocs that belong to this job
	for _, a := range allocs {
		// NOTE(review): the LookupTaskGroup result is dereferenced without a
		// nil check — confirm it cannot be nil for a stored allocation.
		tg := a.Job.LookupTaskGroup(a.TaskGroup)
		found[tg.Name] = struct{}{}
		for _, t := range tg.Tasks {
			if t.CSIPluginConfig == nil {
				continue
			}
			plugAllocs = append(plugAllocs, &pair{
				pluginID: t.CSIPluginConfig.ID,
				alloc:    a,
			})
		}
	}

	// Find any plugins that do not yet have allocs for this job
	for _, tg := range job.TaskGroups {
		if _, ok := found[tg.Name]; ok {
			continue
		}

		for _, t := range tg.Tasks {
			if t.CSIPluginConfig == nil {
				continue
			}
			plugAllocs = append(plugAllocs, &pair{
				pluginID: t.CSIPluginConfig.ID,
			})
		}
	}

	// working copies of the affected plugins, keyed by plugin ID
	plugins := map[string]*structs.CSIPlugin{}

	for _, x := range plugAllocs {
		plug, ok := plugins[x.pluginID]

		if !ok {
			plug, err = s.CSIPluginByIDTxn(txn, nil, x.pluginID)
			if err != nil {
				return fmt.Errorf("error getting plugin: %s, %v", x.pluginID, err)
			}
			if plug == nil {
				// plugin was never successfully registered or has been
				// GC'd out from under us
				continue
			}
			// only copy once, so we update the same plugin on each alloc
			plugins[x.pluginID] = plug.Copy()
			plug = plugins[x.pluginID]
		}

		// entries without an alloc only make sure the plugin is loaded
		if x.alloc == nil {
			continue
		}
		err := plug.DeleteAlloc(x.alloc.ID, x.alloc.NodeID)
		if err != nil {
			return err
		}
	}

	// drop the job from every touched plugin, then update or delete it
	for _, plug := range plugins {
		plug.DeleteJob(job, summary)
		err = updateOrGCPlugin(index, txn, plug)
		if err != nil {
			return err
		}
	}

	if len(plugins) > 0 {
		if err = txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil {
			return fmt.Errorf("index update failed: %v", err)
		}
	}

	return nil
}

// NodeByID looks up a node by its ID. The lookup's watch channel is added to
// ws. A nil node with a nil error is returned when no node matches.
func (s *StateStore) NodeByID(ws memdb.WatchSet, nodeID string) (*structs.Node, error) {
	watchCh, raw, err := s.db.ReadTxn().FirstWatch("nodes", "id", nodeID)
	if err != nil {
		return nil, fmt.Errorf("node lookup failed: %v", err)
	}
	ws.Add(watchCh)

	if raw == nil {
		return nil, nil
	}
	return raw.(*structs.Node), nil
}

// NodesByIDPrefix returns an iterator over the nodes whose ID starts with
// nodeID. The iterator's watch channel is added to ws.
func (s *StateStore) NodesByIDPrefix(ws memdb.WatchSet, nodeID string) (memdb.ResultIterator, error) {
	nodes, err := s.db.ReadTxn().Get("nodes", "id_prefix", nodeID)
	if err != nil {
		return nil, fmt.Errorf("node lookup failed: %v", err)
	}

	ws.Add(nodes.WatchCh())
	return nodes, nil
}

// NodeBySecretID looks up a node by its SecretID. The lookup's watch channel
// is added to ws. A nil node with a nil error is returned when no node
// matches.
func (s *StateStore) NodeBySecretID(ws memdb.WatchSet, secretID string) (*structs.Node, error) {
	watchCh, raw, err := s.db.ReadTxn().FirstWatch("nodes", "secret_id", secretID)
	if err != nil {
		return nil, fmt.Errorf("node lookup by SecretID failed: %v", err)
	}
	ws.Add(watchCh)

	if raw == nil {
		return nil, nil
	}
	return raw.(*structs.Node), nil
}

// Nodes returns an iterator over every node in the "nodes" table. The
// iterator's watch channel is added to ws.
func (s *StateStore) Nodes(ws memdb.WatchSet) (memdb.ResultIterator, error) {
	// The "id" index without arguments covers the whole table.
	nodes, err := s.db.ReadTxn().Get("nodes", "id")
	if err != nil {
		return nil, err
	}

	ws.Add(nodes.WatchCh())
	return nodes, nil
}

// UpsertJob registers a job or updates a job definition in its own write
// transaction, committing only if the upsert succeeds.
func (s *StateStore) UpsertJob(msgType structs.MessageType, index uint64, sub *structs.JobSubmission, job *structs.Job) error {
	tx := s.db.WriteTxnMsgT(msgType, index)
	defer tx.Abort()

	err := s.upsertJobImpl(index, sub, job, false, tx)
	if err != nil {
		return err
	}
	return tx.Commit()
}

// UpsertJobTxn registers a job or updates a job definition like UpsertJob,
// but inside a transaction supplied by the caller. This is useful when
// several modifications have to be made atomically. The transaction is neither
// committed nor aborted here.
func (s *StateStore) UpsertJobTxn(index uint64, sub *structs.JobSubmission, job *structs.Job, txn Txn) error {
	const keepVersion = false
	return s.upsertJobImpl(index, sub, job, keepVersion, txn)
}

// upsertJobImpl is the implementation for registering a job or updating a job
// definition.
//
// The job's namespace must already exist. When a job with the same namespace
// and ID is already stored, its CreateIndex is kept and, unless keepVersion is
// set, JobModifyIndex is set to index and Version is raised above the stored
// version if it is not already higher. For a new job all three indexes are
// set to index and the status is initialized through setJobStatus.
//
// Afterwards the job summary, job version history, scaling policies,
// recommendations, CSI plugins and job submission are updated, and finally the
// job itself and the "jobs" table index are written.
func (s *StateStore) upsertJobImpl(index uint64, sub *structs.JobSubmission, job *structs.Job, keepVersion bool, txn *txn) error {
	// Assert the namespace exists
	if exists, err := s.namespaceExists(txn, job.Namespace); err != nil {
		return err
	} else if !exists {
		return fmt.Errorf("job %q is in nonexistent namespace %q", job.ID, job.Namespace)
	}

	// Check if the job already exists
	existing, err := txn.First("jobs", "id", job.Namespace, job.ID)
	// existingJob stays nil when the job is registered for the first time
	var existingJob *structs.Job
	if err != nil {
		return fmt.Errorf("job lookup failed: %v", err)
	}

	// Setup the indexes correctly
	if existing != nil {
		job.CreateIndex = existing.(*structs.Job).CreateIndex
		job.ModifyIndex = index

		existingJob = existing.(*structs.Job)

		// Bump the version unless asked to keep it. This should only be done
		// when changing an internal field such as Stable. A spec change should
		// always come with a version bump
		if !keepVersion {
			job.JobModifyIndex = index
			if job.Version <= existingJob.Version {
				job.Version = existingJob.Version + 1
			}
		}

		// Compute the job status
		var err error
		job.Status, err = s.getJobStatus(txn, job, false)
		if err != nil {
			return fmt.Errorf("setting job status for %q failed: %v", job.ID, err)
		}
	} else {
		job.CreateIndex = index
		job.ModifyIndex = index
		job.JobModifyIndex = index

		if err := s.setJobStatus(index, txn, job, false, ""); err != nil {
			return fmt.Errorf("setting job status for %q failed: %v", job.ID, err)
		}

		// Have to get the job again since it could have been updated
		updated, err := txn.First("jobs", "id", job.Namespace, job.ID)
		if err != nil {
			return fmt.Errorf("job lookup failed: %v", err)
		}
		if updated != nil {
			job = updated.(*structs.Job)
		}
	}

	// Update the tables derived from the job before writing the job itself
	if err := s.updateSummaryWithJob(index, job, txn); err != nil {
		return fmt.Errorf("unable to create job summary: %v", err)
	}

	if err := s.upsertJobVersion(index, job, txn); err != nil {
		return fmt.Errorf("unable to upsert job into job_version table: %v", err)
	}

	if err := s.updateJobScalingPolicies(index, job, txn); err != nil {
		return fmt.Errorf("unable to update job scaling policies: %v", err)
	}

	if err := s.updateJobRecommendations(index, txn, existingJob, job); err != nil {
		return fmt.Errorf("unable to update job recommendations: %v", err)
	}

	if err := s.updateJobCSIPlugins(index, job, existingJob, txn); err != nil {
		return fmt.Errorf("unable to update job csi plugins: %v", err)
	}

	if err := s.updateJobSubmission(index, sub, job.Namespace, job.ID, job.Version, txn); err != nil {
		return fmt.Errorf("unable to update job submission: %v", err)
	}

	// Insert the job
	if err := txn.Insert("jobs", job); err != nil {
		return fmt.Errorf("job insert failed: %v", err)
	}
	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return nil
}

// DeleteJob deregisters a job in its own write transaction, committing only
// if the deletion succeeds.
func (s *StateStore) DeleteJob(index uint64, namespace, jobID string) error {
	tx := s.db.WriteTxn(index)
	defer tx.Abort()

	if err := s.DeleteJobTxn(index, namespace, jobID, tx); err != nil {
		return err
	}
	return tx.Commit()
}

// DeleteJobTxn is used to deregister a job, like DeleteJob, but in a
// transaction supplied by the caller. Useful for when making multiple
// modifications atomically.
//
// Besides the job itself it removes the job's versions, summary, submission,
// scaling policies, recommendations and scaling events, and detaches the job
// from its CSI plugins. When the job is a child job whose parent summary
// still exists, the parent's children counters are adjusted as well.
//
// An error is returned when the job does not exist or any of the lookups,
// deletes or index updates fails.
func (s *StateStore) DeleteJobTxn(index uint64, namespace, jobID string, txn Txn) error {
	// Lookup the job
	existing, err := txn.First("jobs", "id", namespace, jobID)
	if err != nil {
		return fmt.Errorf("job lookup failed: %v", err)
	}
	if existing == nil {
		return fmt.Errorf("job not found")
	}

	// Check if we should update a parent job summary
	job := existing.(*structs.Job)
	if job.ParentID != "" {
		summaryRaw, err := txn.First("job_summary", "id", namespace, job.ParentID)
		if err != nil {
			return fmt.Errorf("unable to retrieve summary for parent job: %v", err)
		}

		// Only continue if the summary exists. It could not exist if the parent
		// job was removed
		if summaryRaw != nil {
			parentSummary := summaryRaw.(*structs.JobSummary)
			pSummary := parentSummary.Copy()
			if pSummary.Children != nil {

				// A pending or running child is counted as dead from now on; a
				// child that is already dead needs no adjustment.
				modified := false
				switch job.Status {
				case structs.JobStatusPending:
					pSummary.Children.Pending--
					pSummary.Children.Dead++
					modified = true
				case structs.JobStatusRunning:
					pSummary.Children.Running--
					pSummary.Children.Dead++
					modified = true
				case structs.JobStatusDead:
				default:
					return fmt.Errorf("unknown old job status %q", job.Status)
				}

				if modified {
					// Update the modify index
					pSummary.ModifyIndex = index

					// Insert the summary
					if err := txn.Insert("job_summary", pSummary); err != nil {
						return fmt.Errorf("job summary insert failed: %v", err)
					}
					if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
						return fmt.Errorf("index update failed: %v", err)
					}
				}
			}
		}
	}

	// Delete the job
	if err := txn.Delete("jobs", existing); err != nil {
		return fmt.Errorf("job delete failed: %v", err)
	}
	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	// Delete the job versions
	if err := s.deleteJobVersions(index, job, txn); err != nil {
		return err
	}

	// Cleanup plugins registered by this job, before we delete the summary
	err = s.deleteJobFromPlugins(index, txn, job)
	if err != nil {
		return fmt.Errorf("deleting job from plugin: %v", err)
	}

	// Delete the job summary
	if _, err = txn.DeleteAll("job_summary", "id", namespace, jobID); err != nil {
		return fmt.Errorf("deleting job summary failed: %v", err)
	}
	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	// Delete the job submission
	if err := s.deleteJobSubmission(job, txn); err != nil {
		return fmt.Errorf("deleting job submission failed: %v", err)
	}

	// Delete any remaining job scaling policies
	if err := s.deleteJobScalingPolicies(index, job, txn); err != nil {
		return fmt.Errorf("deleting job scaling policies failed: %v", err)
	}

	// Delete any job recommendations
	if err := s.deleteRecommendationsByJob(index, txn, job); err != nil {
		return fmt.Errorf("deleting job recommendations failed: %v", err)
	}

	// Delete the scaling events
	if _, err = txn.DeleteAll("scaling_event", "id", namespace, jobID); err != nil {
		return fmt.Errorf("deleting job scaling events failed: %v", err)
	}
	if err := txn.Insert("index", &IndexEntry{"scaling_event", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return nil
}

// deleteJobScalingPolicies removes every scaling policy associated with the
// job. The "scaling_policy" table index is only bumped when at least one
// policy was deleted.
func (s *StateStore) deleteJobScalingPolicies(index uint64, job *structs.Job, txn *txn) error {
	iter, err := s.ScalingPoliciesByJobTxn(nil, job.Namespace, job.ID, txn)
	if err != nil {
		return fmt.Errorf("getting job scaling policies for deletion failed: %v", err)
	}

	// Drain the iterator first, so nothing is deleted while it is still being
	// walked.
	var doomed []interface{}
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		doomed = append(doomed, raw)
	}
	if len(doomed) == 0 {
		return nil
	}

	for _, policy := range doomed {
		if err := txn.Delete("scaling_policy", policy); err != nil {
			return fmt.Errorf("deleting scaling policy failed: %v", err)
		}
	}

	if err := txn.Insert("index", &IndexEntry{"scaling_policy", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}
	return nil
}

// deleteJobSubmission removes all stored submissions of the job from the
// "job_submission" table. No table index is updated here.
func (s *StateStore) deleteJobSubmission(job *structs.Job, txn *txn) error {
	if _, err := txn.DeleteAll("job_submission", "by_jobID", job.Namespace, job.ID); err != nil {
		return err
	}
	return nil
}

// deleteJobVersions removes every tracked version of the given job from the
// "job_version" table and bumps that table's index to index.
func (s *StateStore) deleteJobVersions(index uint64, job *structs.Job, txn *txn) error {
	iter, err := txn.Get("job_version", "id_prefix", job.Namespace, job.ID)
	if err != nil {
		return err
	}

	// Collect the matches first, so nothing is deleted while the iterator is
	// still being walked. The index is a prefix match, so entries of other
	// jobs whose ID merely starts with job.ID are filtered out.
	var versions []*structs.Job
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		if v := raw.(*structs.Job); v.ID == job.ID {
			versions = append(versions, v)
		}
	}

	for _, v := range versions {
		if err := txn.Delete("job_version", v); err != nil {
			return fmt.Errorf("deleting job versions failed: %v", err)
		}
	}

	if err := txn.Insert("index", &IndexEntry{"job_version", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}
	return nil
}

// upsertJobVersion inserts a job into its historic version table and limits the
// number of job versions that are tracked.
//
// After the insert, the tracked versions of the job are loaded newest first.
// If there are more than structs.JobTrackedVersions of them, the version just
// outside the keep set is deleted. The most recent stable version is protected:
// when it is the one that would be deleted, it is swapped into the keep set and
// its neighbour is deleted instead.
//
// An error is returned when the insert, the index update, the version lookup
// or the delete fails.
func (s *StateStore) upsertJobVersion(index uint64, job *structs.Job, txn *txn) error {
	// Insert the job
	if err := txn.Insert("job_version", job); err != nil {
		return fmt.Errorf("failed to insert job into job_version table: %v", err)
	}

	if err := txn.Insert("index", &IndexEntry{"job_version", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	// Get all the historic jobs for this ID
	all, err := s.jobVersionByID(txn, nil, job.Namespace, job.ID)
	if err != nil {
		return fmt.Errorf("failed to look up job versions for %q: %v", job.ID, err)
	}

	// If we are below the limit there is no GCing to be done
	if len(all) <= structs.JobTrackedVersions {
		return nil
	}

	// We have to delete a historic job to make room.
	// Find index of the highest versioned stable job
	stableIdx := -1
	for i, j := range all {
		if j.Stable {
			stableIdx = i
			break
		}
	}

	// If the stable job is the oldest version, do a swap to bring it into the
	// keep set.
	limit := structs.JobTrackedVersions
	if stableIdx == limit {
		all[limit-1], all[limit] = all[limit], all[limit-1]
	}

	// Delete the job outside of the set that are being kept. The underlying
	// error is included so the cause of a failed delete is not lost.
	d := all[limit]
	if err := txn.Delete("job_version", d); err != nil {
		return fmt.Errorf("failed to delete job %v (%d) from job_version: %v", d.ID, d.Version, err)
	}

	return nil
}

// JobSubmission returns the original HCL/Variables context of a job version,
// if available.
//
// Note: a missing submission context is a normal case; nil is then returned
// with no error.
func (s *StateStore) JobSubmission(ws memdb.WatchSet, namespace, jobName string, version uint64) (*structs.JobSubmission, error) {
	return s.jobSubmission(ws, namespace, jobName, version, s.db.ReadTxn())
}

// jobSubmission looks up the submission stored for the given namespace, job
// name and version through the supplied transaction. The lookup's watch
// channel is added to ws. A nil submission with a nil error is returned when
// nothing is stored.
func (s *StateStore) jobSubmission(ws memdb.WatchSet, namespace, jobName string, version uint64, txn Txn) (*structs.JobSubmission, error) {
	watchCh, raw, err := txn.FirstWatch("job_submission", "id", namespace, jobName, version)
	if err != nil {
		return nil, fmt.Errorf("job submission lookup failed: %v", err)
	}
	ws.Add(watchCh)

	if raw == nil {
		return nil, nil
	}
	return raw.(*structs.JobSubmission), nil
}

// JobByID looks up a job by namespace and ID using a fresh read transaction.
// The entry found in the "jobs" table, i.e. the current/latest job version, is
// returned.
func (s *StateStore) JobByID(ws memdb.WatchSet, namespace, id string) (*structs.Job, error) {
	return s.JobByIDTxn(ws, namespace, id, s.db.ReadTxn())
}

// JobByIDTxn looks up a job by namespace and ID like JobByID, but reads
// through the supplied transaction, so the job is returned as it is visible
// in that transaction. The lookup's watch channel is added to ws. A nil job
// with a nil error is returned when no job matches.
func (s *StateStore) JobByIDTxn(ws memdb.WatchSet, namespace, id string, txn Txn) (*structs.Job, error) {
	watchCh, raw, err := txn.FirstWatch("jobs", "id", namespace, id)
	if err != nil {
		return nil, fmt.Errorf("job lookup failed: %v", err)
	}
	ws.Add(watchCh)

	if raw == nil {
		return nil, nil
	}
	return raw.(*structs.Job), nil
}

// JobsByIDPrefix returns an iterator over the jobs whose ID starts with id.
// Within a single namespace the "id_prefix" index is used; when namespace is
// structs.AllNamespacesSentinel the whole table is scanned and filtered
// instead of using an index.
func (s *StateStore) JobsByIDPrefix(ws memdb.WatchSet, namespace, id string) (memdb.ResultIterator, error) {
	if namespace == structs.AllNamespacesSentinel {
		return s.jobsByIDPrefixAllNamespaces(ws, id)
	}

	jobs, err := s.db.ReadTxn().Get("jobs", "id_prefix", namespace, id)
	if err != nil {
		return nil, fmt.Errorf("job lookup failed: %v", err)
	}

	ws.Add(jobs.WatchCh())
	return jobs, nil
}

// jobsByIDPrefixAllNamespaces returns an iterator over the jobs of all
// namespaces whose ID starts with prefix. It walks the whole "jobs" table and
// filters the results; the watch channel of the unfiltered iterator is added
// to ws.
func (s *StateStore) jobsByIDPrefixAllNamespaces(ws memdb.WatchSet, prefix string) (memdb.ResultIterator, error) {
	// The "id" index without arguments covers the whole table.
	allJobs, err := s.db.ReadTxn().Get("jobs", "id")
	if err != nil {
		return nil, err
	}
	ws.Add(allJobs.WatchCh())

	// The filter reports true for entries that must be dropped: anything that
	// is not a job, and jobs whose ID lacks the prefix.
	dropUnlessPrefixed := func(raw interface{}) bool {
		job, isJob := raw.(*structs.Job)
		return !isJob || !strings.HasPrefix(job.ID, prefix)
	}

	return memdb.NewFilterIterator(allJobs, dropUnlessPrefixed), nil
}

// JobVersionsByID returns all tracked versions of a job, highest version
// first, using a fresh read transaction.
func (s *StateStore) JobVersionsByID(ws memdb.WatchSet, namespace, id string) ([]*structs.Job, error) {
	return s.jobVersionByID(s.db.ReadTxn(), ws, namespace, id)
}

// jobVersionByID retrieves all tracked versions of a job under an existing
// transaction. The iterator's watch channel is added to ws, which callers in
// this file may pass as nil. The result is ordered by descending version.
func (s *StateStore) jobVersionByID(txn *txn, ws memdb.WatchSet, namespace, id string) ([]*structs.Job, error) {
	iter, err := txn.Get("job_version", "id_prefix", namespace, id)
	if err != nil {
		return nil, err
	}
	ws.Add(iter.WatchCh())

	// The index is a prefix match, so keep only entries whose ID matches
	// exactly.
	var versions []*structs.Job
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		if v := raw.(*structs.Job); v.ID == id {
			versions = append(versions, v)
		}
	}

	// Highest version first.
	sort.Slice(versions, func(a, b int) bool {
		return versions[b].Version < versions[a].Version
	})

	return versions, nil
}

// JobByIDAndVersion returns the job identified by namespace, ID and version,
// using a fresh read transaction. The passed watchset may be nil.
func (s *StateStore) JobByIDAndVersion(ws memdb.WatchSet, namespace, id string, version uint64) (*structs.Job, error) {
	return s.jobByIDAndVersionImpl(ws, namespace, id, version, s.db.ReadTxn())
}

// jobByIDAndVersionImpl returns the job identified by namespace, ID and
// version, read from the "job_version" table through the supplied
// transaction. The passed watchset may be nil. A nil job with a nil error is
// returned when no such version is stored.
func (s *StateStore) jobByIDAndVersionImpl(ws memdb.WatchSet, namespace, id string,
	version uint64, txn *txn) (*structs.Job, error) {

	watchCh, raw, err := txn.FirstWatch("job_version", "id", namespace, id, version)
	if err != nil {
		return nil, err
	}
	ws.Add(watchCh)

	if raw == nil {
		return nil, nil
	}
	return raw.(*structs.Job), nil
}

// JobVersions returns an iterator over all the tracked job versions
func (s *StateStore) JobVersions(ws memdb.WatchSet) (memdb.ResultIterator, error) {
	// Walk the entire job_version table
	iter, err := s.db.ReadTxn().Get("job_version", "id")
	if err != nil {
		return nil, err
	}

	ws.Add(iter.WatchCh())
	return iter, nil
}

// Jobs walks the whole jobs table and returns an iterator over every job.
// The iterator's watch channel is added to ws.
func (s *StateStore) Jobs(ws memdb.WatchSet) (memdb.ResultIterator, error) {
	iter, err := s.db.ReadTxn().Get("jobs", "id")
	if err != nil {
		return nil, err
	}

	ws.Add(iter.WatchCh())
	return iter, nil
}

// JobsByNamespace returns an iterator over every job in the given namespace,
// using a fresh read transaction.
func (s *StateStore) JobsByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) {
	return s.jobsByNamespaceImpl(ws, namespace, s.db.ReadTxn())
}

// jobsByNamespaceImpl returns an iterator over every job in the given
// namespace, reading through the supplied transaction.
func (s *StateStore) jobsByNamespaceImpl(ws memdb.WatchSet, namespace string, txn *txn) (memdb.ResultIterator, error) {
	// An empty ID prefix matches every job in the namespace
	jobs, err := txn.Get("jobs", "id_prefix", namespace, "")
	if err != nil {
		return nil, err
	}

	ws.Add(jobs.WatchCh())
	return jobs, nil
}

// JobsByPeriodic returns an iterator over the jobs whose "periodic" index
// value equals the periodic argument, i.e. all periodic jobs when true and
// all non-periodic jobs when false.
func (s *StateStore) JobsByPeriodic(ws memdb.WatchSet, periodic bool) (memdb.ResultIterator, error) {
	iter, err := s.db.ReadTxn().Get("jobs", "periodic", periodic)
	if err != nil {
		return nil, err
	}

	ws.Add(iter.WatchCh())
	return iter, nil
}

// JobsByScheduler returns an iterator over the jobs stored under the given
// scheduler type in the "type" index.
func (s *StateStore) JobsByScheduler(ws memdb.WatchSet, schedulerType string) (memdb.ResultIterator, error) {
	iter, err := s.db.ReadTxn().Get("jobs", "type", schedulerType)
	if err != nil {
		return nil, err
	}

	ws.Add(iter.WatchCh())
	return iter, nil
}

// JobsByGC returns an iterator over the jobs whose "gc" index value equals
// the gc argument: jobs eligible for garbage collection when true, and
// ineligible jobs when false.
func (s *StateStore) JobsByGC(ws memdb.WatchSet, gc bool) (memdb.ResultIterator, error) {
	iter, err := s.db.ReadTxn().Get("jobs", "gc", gc)
	if err != nil {
		return nil, err
	}

	ws.Add(iter.WatchCh())
	return iter, nil
}

// JobSummaryByID looks up the summary of the job identified by namespace and
// jobID. It returns nil, nil when no summary is stored for that job.
func (s *StateStore) JobSummaryByID(ws memdb.WatchSet, namespace, jobID string) (*structs.JobSummary, error) {
	watchCh, raw, err := s.db.ReadTxn().FirstWatch("job_summary", "id", namespace, jobID)
	if err != nil {
		return nil, err
	}

	ws.Add(watchCh)

	if raw == nil {
		return nil, nil
	}
	return raw.(*structs.JobSummary), nil
}

// JobSummaries returns an iterator over every object in the job summary
// table.
func (s *StateStore) JobSummaries(ws memdb.WatchSet) (memdb.ResultIterator, error) {
	summaries, err := s.db.ReadTxn().Get("job_summary", "id")
	if err != nil {
		return nil, err
	}

	ws.Add(summaries.WatchCh())
	return summaries, nil
}

// JobSummaryByPrefix returns an iterator over the job summaries in the given
// namespace whose job ID starts with id.
func (s *StateStore) JobSummaryByPrefix(ws memdb.WatchSet, namespace, id string) (memdb.ResultIterator, error) {
	summaries, err := s.db.ReadTxn().Get("job_summary", "id_prefix", namespace, id)
	if err != nil {
		return nil, fmt.Errorf("job_summary lookup failed: %v", err)
	}

	ws.Add(summaries.WatchCh())
	return summaries, nil
}

// UpsertCSIVolume writes the given volumes to the csi_volumes table in a
// single write transaction and records index as the table's latest index.
// Each volume must live in an existing namespace. A volume that is already
// stored may only be replaced when its ExternalID, PluginID and Provider are
// unchanged and it is not in use; its CreateIndex is then carried over.
func (s *StateStore) UpsertCSIVolume(index uint64, volumes []*structs.CSIVolume) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	for _, vol := range volumes {
		nsExists, err := s.namespaceExists(txn, vol.Namespace)
		if err != nil {
			return err
		}
		if !nsExists {
			return fmt.Errorf("volume %s is in nonexistent namespace %s", vol.ID, vol.Namespace)
		}

		raw, err := txn.First("csi_volumes", "id", vol.Namespace, vol.ID)
		if err != nil {
			return fmt.Errorf("volume existence check error: %v", err)
		}

		// A brand new volume is created at this index; an updated one keeps
		// the index it was created at.
		createIndex := index
		if raw != nil {
			// Allow some properties of a volume to be updated in place, but
			// prevent accidentally overwriting important properties, or
			// overwriting a volume in use
			prev := raw.(*structs.CSIVolume)
			identityChanged := prev.ExternalID != vol.ExternalID ||
				prev.PluginID != vol.PluginID ||
				prev.Provider != vol.Provider
			if identityChanged {
				return fmt.Errorf("volume identity cannot be updated: %s", vol.ID)
			}
			// NOTE(review): the denormalized copy and any error are
			// discarded here; the in-use check below runs on the stored
			// volume itself.
			s.CSIVolumeDenormalize(nil, prev.Copy())
			if prev.InUse() {
				return fmt.Errorf("volume cannot be updated while in use")
			}
			createIndex = prev.CreateIndex
		}
		vol.CreateIndex = createIndex
		vol.ModifyIndex = index

		// Allocations are copy on write, so we want to keep the Allocation ID
		// but we need to clear the pointer so that we don't store it when we
		// write the volume to the state store. We'll get it from the db in
		// denormalize.
		for id := range vol.ReadAllocs {
			vol.ReadAllocs[id] = nil
		}
		for id := range vol.WriteAllocs {
			vol.WriteAllocs[id] = nil
		}

		if err = txn.Insert("csi_volumes", vol); err != nil {
			return fmt.Errorf("volume insert: %v", err)
		}
	}

	if err := txn.Insert("index", &IndexEntry{"csi_volumes", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return txn.Commit()
}

// CSIVolumes returns an iterator over every volume in the csi_volumes table,
// with no filtering applied. Caller should snapshot if it wants to also
// denormalize the plugins.
func (s *StateStore) CSIVolumes(ws memdb.WatchSet) (memdb.ResultIterator, error) {
	txn := s.db.ReadTxn()
	defer txn.Abort()

	volumes, err := txn.Get("csi_volumes", "id")
	if err != nil {
		return nil, fmt.Errorf("csi_volumes lookup failed: %v", err)
	}

	ws.Add(volumes.WatchCh())
	return volumes, nil
}

// CSIVolumeByID looks up a single volume by namespace and ID. It returns
// nil, nil when the volume is not stored. Otherwise the result is a copy of
// the stored volume with its plugin information denormalized.
func (s *StateStore) CSIVolumeByID(ws memdb.WatchSet, namespace, id string) (*structs.CSIVolume, error) {
	txn := s.db.ReadTxn()

	watchCh, raw, err := txn.FirstWatch("csi_volumes", "id", namespace, id)
	if err != nil {
		return nil, fmt.Errorf("volume lookup failed for %s: %v", id, err)
	}
	ws.Add(watchCh)

	if raw == nil {
		return nil, nil
	}

	// we return the volume with the plugins denormalized by default,
	// because the scheduler needs them for feasibility checking
	stored := raw.(*structs.CSIVolume)
	return s.csiVolumeDenormalizePluginsTxn(txn, stored.Copy())
}

// CSIVolumesByPluginID returns an iterator over the volumes that use the
// plugin pluginID, restricted to the given namespace and to volume IDs that
// start with prefix. An empty prefix matches every ID. The underlying
// iterator's watch channel is added to ws. Caller should snapshot if it wants
// to also denormalize the plugins.
func (s *StateStore) CSIVolumesByPluginID(ws memdb.WatchSet, namespace, prefix, pluginID string) (memdb.ResultIterator, error) {
	txn := s.db.ReadTxn()

	iter, err := txn.Get("csi_volumes", "plugin_id", pluginID)
	if err != nil {
		return nil, fmt.Errorf("volume lookup failed: %v", err)
	}

	ws.Add(iter.WatchCh())

	// The filter function reports which rows to drop from the results, so a
	// volume is dropped when it is in another namespace or when its ID does
	// not carry the requested prefix.
	f := func(raw interface{}) bool {
		v, ok := raw.(*structs.CSIVolume)
		if !ok {
			return false
		}
		return v.Namespace != namespace || !strings.HasPrefix(v.ID, prefix)
	}

	wrap := memdb.NewFilterIterator(iter, f)
	return wrap, nil
}

// CSIVolumesByIDPrefix returns an iterator over the volumes in namespace
// whose ID starts with volumeID; it backs search. Caller should snapshot if
// it wants to also denormalize the plugins. When namespace is the wildcard
// sentinel the lookup is delegated to csiVolumeByIDPrefixAllNamespaces, which
// does not use the index prefix.
func (s *StateStore) CSIVolumesByIDPrefix(ws memdb.WatchSet, namespace, volumeID string) (memdb.ResultIterator, error) {
	if namespace == structs.AllNamespacesSentinel {
		return s.csiVolumeByIDPrefixAllNamespaces(ws, volumeID)
	}

	volumes, err := s.db.ReadTxn().Get("csi_volumes", "id_prefix", namespace, volumeID)
	if err != nil {
		return nil, err
	}

	ws.Add(volumes.WatchCh())
	return volumes, nil
}

// csiVolumeByIDPrefixAllNamespaces returns an iterator over the volumes in
// any namespace whose ID starts with prefix. It walks the entire csi_volumes
// table and filters the rows afterwards.
func (s *StateStore) csiVolumeByIDPrefixAllNamespaces(ws memdb.WatchSet, prefix string) (memdb.ResultIterator, error) {
	all, err := s.db.ReadTxn().Get("csi_volumes", "id")
	if err != nil {
		return nil, err
	}

	ws.Add(all.WatchCh())

	// The filter reports which rows to drop: volumes whose ID lacks the
	// prefix. Rows of an unexpected type are left in place.
	dropNonMatching := func(raw interface{}) bool {
		vol, ok := raw.(*structs.CSIVolume)
		return ok && !strings.HasPrefix(vol.ID, prefix)
	}
	return memdb.NewFilterIterator(all, dropNonMatching), nil
}

// CSIVolumesByNodeID looks up CSIVolumes in use on a node: the CSI volumes
// requested by the task groups of allocations on nodeID that are running or
// desired to run, limited to volume sources that start with prefix. Volume
// sources that have no matching row in the csi_volumes table are skipped.
// Caller should snapshot if it wants to also denormalize the plugins.
func (s *StateStore) CSIVolumesByNodeID(ws memdb.WatchSet, prefix, nodeID string) (memdb.ResultIterator, error) {
	allocs, err := s.AllocsByNode(ws, nodeID)
	if err != nil {
		return nil, fmt.Errorf("alloc lookup failed: %v", err)
	}

	// Find volume ids for CSI volumes in running allocs, or allocs that we desire to run
	ids := map[string]string{} // Map volumeID to Namespace
	for _, a := range allocs {
		if a.DesiredStatus != structs.AllocDesiredStatusRun &&
			a.ClientStatus != structs.AllocClientStatusRunning {
			continue
		}

		// The alloc's task group may be missing from its job; treat that the
		// same as a group without volumes instead of dereferencing nil.
		tg := a.Job.LookupTaskGroup(a.TaskGroup)
		if tg == nil || len(tg.Volumes) == 0 {
			continue
		}

		for _, v := range tg.Volumes {
			if v.Type != structs.VolumeTypeCSI {
				continue
			}
			ids[v.Source] = a.Namespace
		}
	}

	// Lookup the raw CSIVolumes to match the other list interfaces
	iter := NewSliceIterator()
	txn := s.db.ReadTxn()
	for id, namespace := range ids {
		if !strings.HasPrefix(id, prefix) {
			continue
		}
		watchCh, raw, err := txn.FirstWatch("csi_volumes", "id", namespace, id)
		if err != nil {
			return nil, fmt.Errorf("volume lookup failed: %s %v", id, err)
		}
		ws.Add(watchCh)

		// A nil row means the volume is not registered. Result iterators
		// signal exhaustion with nil, so storing it would hide any volumes
		// added after it.
		if raw != nil {
			iter.Add(raw)
		}
	}

	return iter, nil
}

// CSIVolumesByNamespace returns an iterator over the volumes in namespace
// whose ID starts with prefix, using a fresh read transaction.
func (s *StateStore) CSIVolumesByNamespace(ws memdb.WatchSet, namespace, prefix string) (memdb.ResultIterator, error) {
	return s.csiVolumesByNamespaceImpl(s.db.ReadTxn(), ws, namespace, prefix)
}

// csiVolumesByNamespaceImpl returns an iterator over the volumes in namespace
// whose ID starts with prefix, reading through the supplied transaction.
func (s *StateStore) csiVolumesByNamespaceImpl(txn *txn, ws memdb.WatchSet, namespace, prefix string) (memdb.ResultIterator, error) {
	volumes, err := txn.Get("csi_volumes", "id_prefix", namespace, prefix)
	if err != nil {
		return nil, fmt.Errorf("volume lookup failed: %v", err)
	}

	ws.Add(volumes.WatchCh())
	return volumes, nil
}

// CSIVolumeClaim updates the volume's claim count and allocation list.
//
// The volume identified by namespace and id is read inside a write
// transaction, copied, denormalized (plugins and allocations), updated via
// volume.Claim when the claim carries an allocation ID, and written back with
// ModifyIndex set to index. The csi_volumes table index is bumped as well.
// An error is returned when the volume does not exist, when a lookup or
// denormalization fails, or when volume.Claim rejects the claim.
func (s *StateStore) CSIVolumeClaim(index uint64, namespace, id string, claim *structs.CSIVolumeClaim) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	row, err := txn.First("csi_volumes", "id", namespace, id)
	if err != nil {
		return fmt.Errorf("volume lookup failed: %s: %v", id, err)
	}
	if row == nil {
		return fmt.Errorf("volume not found: %s", id)
	}

	orig, ok := row.(*structs.CSIVolume)
	if !ok {
		return fmt.Errorf("volume row conversion error")
	}

	// The allocation is only looked up for claims in the "taken" state; for
	// any other state alloc stays nil.
	var alloc *structs.Allocation
	if claim.State == structs.CSIVolumeClaimStateTaken {
		alloc, err = s.allocByIDImpl(txn, nil, claim.AllocationID)
		if err != nil {
			s.logger.Error("AllocByID failed", "error", err)
			return fmt.Errorf(structs.ErrUnknownAllocationPrefix)
		}
		if alloc == nil {
			s.logger.Error("AllocByID failed to find alloc", "alloc_id", claim.AllocationID)
			// NOTE(review): err is always nil at this point because the
			// branch above already returned on a non-nil err, so this return
			// is unreachable and a missing allocation is only logged. The
			// nil alloc is then passed on to volume.Claim below.
			if err != nil {
				return fmt.Errorf(structs.ErrUnknownAllocationPrefix)
			}
		}
	}

	// Work on a copy of the stored row; the denormalize helpers mutate the
	// volume they are given.
	volume, err := s.csiVolumeDenormalizePluginsTxn(txn, orig.Copy())
	if err != nil {
		return err
	}
	volume, err = s.csiVolumeDenormalizeTxn(txn, nil, volume)
	if err != nil {
		return err
	}

	// in the case of a job deregistration, there will be no allocation ID
	// for the claim but we still want to write an updated index to the volume
	// so that volume reaping is triggered
	if claim.AllocationID != "" {
		err = volume.Claim(claim, alloc)
		if err != nil {
			return err
		}
	}

	volume.ModifyIndex = index

	// Allocations are copy on write, so we want to keep the Allocation ID
	// but we need to clear the pointer so that we don't store it when we
	// write the volume to the state store. We'll get it from the db in
	// denormalize.
	for allocID := range volume.ReadAllocs {
		volume.ReadAllocs[allocID] = nil
	}
	for allocID := range volume.WriteAllocs {
		volume.WriteAllocs[allocID] = nil
	}

	if err = txn.Insert("csi_volumes", volume); err != nil {
		return fmt.Errorf("volume update failed: %s: %v", id, err)
	}

	if err = txn.Insert("index", &IndexEntry{"csi_volumes", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return txn.Commit()
}

// CSIVolumeDeregister deletes the volumes with the given IDs in namespace
// from the csi_volumes table in one write transaction and bumps the table
// index. It fails without committing anything when a volume is missing, or
// when a volume is in use and either force is false or volSafeToForce
// rejects it.
func (s *StateStore) CSIVolumeDeregister(index uint64, namespace string, ids []string, force bool) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	for _, id := range ids {
		raw, err := txn.First("csi_volumes", "id", namespace, id)
		if err != nil {
			return fmt.Errorf("volume lookup failed: %s: %v", id, err)
		}
		if raw == nil {
			return fmt.Errorf("volume not found: %s", id)
		}

		vol, isVolume := raw.(*structs.CSIVolume)
		if !isVolume {
			return fmt.Errorf("volume row conversion error: %s", id)
		}

		// The common case for a volume deregister is when the volume is
		// unused, but we can also let an operator intervene in the case where
		// allocations have been stopped but claims can't be freed because
		// ex. the plugins have all been removed.
		if vol.InUse() && !(force && s.volSafeToForce(txn, vol)) {
			return fmt.Errorf("volume in use: %s", id)
		}

		if err = txn.Delete("csi_volumes", raw); err != nil {
			return fmt.Errorf("volume delete failed: %s: %v", id, err)
		}
	}

	if err := txn.Insert("index", &IndexEntry{"csi_volumes", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return txn.Commit()
}

// volSafeToForce reports whether none of the volume's remaining allocations
// is in a non-terminal state. It works on a denormalized copy of v and
// returns false when denormalization fails.
func (s *StateStore) volSafeToForce(txn Txn, v *structs.CSIVolume) bool {
	vol, err := s.csiVolumeDenormalizeTxn(txn, nil, v.Copy())
	if err != nil {
		return false
	}

	// Readers first, then writers; nil entries are ignored.
	for _, allocs := range []map[string]*structs.Allocation{vol.ReadAllocs, vol.WriteAllocs} {
		for _, a := range allocs {
			if a == nil {
				continue
			}
			if !a.TerminalStatus() {
				return false
			}
		}
	}
	return true
}

// CSIVolumeDenormalizePlugins fills in the volume's current health and
// plugin fields, leaving its allocations untouched. A nil volume yields
// nil, nil.
// Use this for current volume metadata, handling lists of volumes.
// Use CSIVolumeDenormalize for volumes containing both health and current
// allocations.
func (s *StateStore) CSIVolumeDenormalizePlugins(ws memdb.WatchSet, vol *structs.CSIVolume) (*structs.CSIVolume, error) {
	if vol != nil {
		txn := s.db.ReadTxn()
		defer txn.Abort()
		return s.csiVolumeDenormalizePluginsTxn(txn, vol)
	}
	return nil, nil
}

// csiVolumeDenormalizePluginsTxn implements CSIVolumeDenormalizePlugins
// inside a transaction. It mutates vol in place and returns it. When the
// volume's plugin is not stored, the health counts are zeroed and the volume
// is marked unschedulable.
func (s *StateStore) csiVolumeDenormalizePluginsTxn(txn Txn, vol *structs.CSIVolume) (*structs.CSIVolume, error) {
	if vol == nil {
		return nil, nil
	}

	plug, err := s.CSIPluginByIDTxn(txn, nil, vol.PluginID)
	if err != nil {
		return nil, fmt.Errorf("plugin lookup error: %s %v", vol.PluginID, err)
	}
	if plug == nil {
		vol.ControllersHealthy, vol.NodesHealthy = 0, 0
		vol.Schedulable = false
		return vol, nil
	}

	// Identity of the plugin backing this volume
	vol.Provider = plug.Provider
	vol.ProviderVersion = plug.Version
	vol.ControllerRequired = plug.ControllerRequired

	// Current health counts
	vol.ControllersHealthy = plug.ControllersHealthy
	vol.NodesHealthy = plug.NodesHealthy

	// This value may be stale, but stale is ok
	vol.ControllersExpected = plug.ControllersExpected
	vol.NodesExpected = plug.NodesExpected

	// Schedulable needs at least one healthy node plugin and, when a
	// controller is required, at least one healthy controller too.
	schedulable := vol.NodesHealthy > 0
	if vol.ControllerRequired {
		schedulable = vol.ControllersHealthy > 0 && schedulable
	}
	vol.Schedulable = schedulable

	return vol, nil
}

// CSIVolumeDenormalize fills in the volume's current Allocations and Claims
// using a fresh read transaction, including creating new PastClaims for
// terminal or garbage collected allocations. This ensures we have a
// consistent state. It mutates the volume it is given, so it should always
// be called on a Copy after reading from the state store.
func (s *StateStore) CSIVolumeDenormalize(ws memdb.WatchSet, vol *structs.CSIVolume) (*structs.CSIVolume, error) {
	return s.csiVolumeDenormalizeTxn(s.db.ReadTxn(), ws, vol)
}

// csiVolumeDenormalizeTxn implements CSIVolumeDenormalize inside a
// transaction.
//
// For every allocation ID recorded in vol.ReadAllocs and vol.WriteAllocs it
// looks the allocation up and stores the result (possibly nil) back into the
// map, synthesizes a current claim when none is recorded, and adds a past
// claim in the "unpublishing" state when the allocation is missing or
// terminal and no past claim exists yet. Finally it backfills empty
// AccessMode/AttachmentMode values on current claims from the volume. vol is
// mutated in place and returned; a nil vol yields nil, nil.
func (s *StateStore) csiVolumeDenormalizeTxn(txn Txn, ws memdb.WatchSet, vol *structs.CSIVolume) (*structs.CSIVolume, error) {
	if vol == nil {
		return nil, nil
	}

	// note: denormalize mutates the maps we pass in!
	denormalize := func(
		currentAllocs map[string]*structs.Allocation,
		currentClaims, pastClaims map[string]*structs.CSIVolumeClaim,
		fallbackMode structs.CSIVolumeClaimMode) error {

		for id := range currentAllocs {
			a, err := s.allocByIDImpl(txn, ws, id)
			if err != nil {
				return err
			}
			pastClaim := pastClaims[id]
			currentClaim := currentClaims[id]
			if currentClaim == nil {
				// COMPAT(1.4.0): the CSIVolumeClaim fields were added
				// after 0.11.1, so claims made before that may be
				// missing this value. No clusters should see this
				// anymore, so warn noisily in the logs so that
				// operators ask us about it. Remove this block and
				// the now-unused fallbackMode parameter, and return
				// an error if currentClaim is nil in 1.4.0
				//
				// NOTE(review): a is dereferenced below without a nil
				// check, although the code further down treats a nil
				// allocation as possible — confirm this branch cannot
				// be reached with a == nil.
				s.logger.Warn("volume was missing claim for allocation",
					"volume_id", vol.ID, "alloc", id)
				currentClaim = &structs.CSIVolumeClaim{
					AllocationID: a.ID,
					NodeID:       a.NodeID,
					Mode:         fallbackMode,
					State:        structs.CSIVolumeClaimStateTaken,
				}
				currentClaims[id] = currentClaim
			}

			// Store the looked-up allocation; it is nil when the
			// allocation was not found.
			currentAllocs[id] = a
			if (a == nil || a.TerminalStatus()) && pastClaim == nil {
				// the alloc is missing (garbage collected) or terminal but
				// nothing has written a PastClaim, so create one now
				pastClaim = &structs.CSIVolumeClaim{
					AllocationID:   id,
					NodeID:         currentClaim.NodeID,
					Mode:           currentClaim.Mode,
					State:          structs.CSIVolumeClaimStateUnpublishing,
					AccessMode:     currentClaim.AccessMode,
					AttachmentMode: currentClaim.AttachmentMode,
				}
				pastClaims[id] = pastClaim
			}

		}
		return nil
	}

	// Readers and writers share the same PastClaims map, so a past claim
	// created by the first pass is visible to the second.
	err := denormalize(vol.ReadAllocs, vol.ReadClaims, vol.PastClaims,
		structs.CSIVolumeClaimRead)
	if err != nil {
		return nil, err
	}
	err = denormalize(vol.WriteAllocs, vol.WriteClaims, vol.PastClaims,
		structs.CSIVolumeClaimWrite)
	if err != nil {
		return nil, err
	}

	// COMPAT: the AccessMode and AttachmentMode fields were added to claims
	// in 1.1.0, so claims made before that may be missing this value. In this
	// case, the volume will already have AccessMode/AttachmentMode until it
	// no longer has any claims, so set from those values
	for _, claim := range vol.ReadClaims {
		if claim.AccessMode == "" || claim.AttachmentMode == "" {
			claim.AccessMode = vol.AccessMode
			claim.AttachmentMode = vol.AttachmentMode
		}
	}
	for _, claim := range vol.WriteClaims {
		if claim.AccessMode == "" || claim.AttachmentMode == "" {
			claim.AccessMode = vol.AccessMode
			claim.AttachmentMode = vol.AttachmentMode
		}
	}

	return vol, nil
}

// CSIPlugins returns an iterator over every entry in the csi_plugins table,
// with no filtering applied.
func (s *StateStore) CSIPlugins(ws memdb.WatchSet) (memdb.ResultIterator, error) {
	txn := s.db.ReadTxn()
	defer txn.Abort()

	plugins, err := txn.Get("csi_plugins", "id")
	if err != nil {
		return nil, fmt.Errorf("csi_plugins lookup failed: %v", err)
	}

	ws.Add(plugins.WatchCh())
	return plugins, nil
}

// CSIPluginsByIDPrefix returns an iterator over the plugins whose ID starts
// with pluginID; it backs search.
func (s *StateStore) CSIPluginsByIDPrefix(ws memdb.WatchSet, pluginID string) (memdb.ResultIterator, error) {
	plugins, err := s.db.ReadTxn().Get("csi_plugins", "id_prefix", pluginID)
	if err != nil {
		return nil, err
	}

	ws.Add(plugins.WatchCh())
	return plugins, nil
}

// CSIPluginByID looks up the plugin with the given ID. This method creates a
// new transaction so you should not call it from within another transaction.
func (s *StateStore) CSIPluginByID(ws memdb.WatchSet, id string) (*structs.CSIPlugin, error) {
	// CSIPluginByIDTxn already yields a nil plugin alongside any error, so
	// its result can be handed back unchanged.
	return s.CSIPluginByIDTxn(s.db.ReadTxn(), ws, id)
}

// CSIPluginByIDTxn looks up the plugin with the given ID inside the supplied
// transaction. It returns nil, nil when no such plugin is stored.
func (s *StateStore) CSIPluginByIDTxn(txn Txn, ws memdb.WatchSet, id string) (*structs.CSIPlugin, error) {
	watchCh, raw, err := txn.FirstWatch("csi_plugins", "id", id)
	if err != nil {
		return nil, fmt.Errorf("csi_plugin lookup failed: %s %v", id, err)
	}

	ws.Add(watchCh)

	if raw == nil {
		return nil, nil
	}
	return raw.(*structs.CSIPlugin), nil
}

// CSIPluginDenormalize fills in the plugin's allocation details using a
// fresh read transaction. Always called on a copy of the plugin.
func (s *StateStore) CSIPluginDenormalize(ws memdb.WatchSet, plug *structs.CSIPlugin) (*structs.CSIPlugin, error) {
	return s.CSIPluginDenormalizeTxn(s.db.ReadTxn(), ws, plug)
}

// CSIPluginDenormalizeTxn appends a stub for every stored allocation that is
// referenced by the plugin's controllers or nodes to plug.Allocations, then
// sorts that list by ModifyIndex, highest first. plug is mutated in place and
// returned; a nil plug yields nil, nil.
func (s *StateStore) CSIPluginDenormalizeTxn(txn Txn, ws memdb.WatchSet, plug *structs.CSIPlugin) (*structs.CSIPlugin, error) {
	if plug == nil {
		return nil, nil
	}

	// Collect each referenced allocation ID once, even when it shows up
	// under both controllers and nodes.
	allocIDs := make(map[string]struct{})
	for _, ctrl := range plug.Controllers {
		allocIDs[ctrl.AllocID] = struct{}{}
	}
	for _, node := range plug.Nodes {
		allocIDs[node.AllocID] = struct{}{}
	}

	for allocID := range allocIDs {
		alloc, err := s.allocByIDImpl(txn, ws, allocID)
		if err != nil {
			return nil, err
		}
		// Allocations that are no longer stored are skipped
		if alloc != nil {
			plug.Allocations = append(plug.Allocations, alloc.Stub(nil))
		}
	}

	sort.Slice(plug.Allocations, func(a, b int) bool {
		return plug.Allocations[a].ModifyIndex > plug.Allocations[b].ModifyIndex
	})

	return plug, nil
}

// UpsertCSIPlugin writes the plugin to the state store and bumps the
// csi_plugins table index. The plugin's ModifyIndex is set to index, and its
// CreateIndex is copied from the stored plugin when one exists. Note: there
// is currently no raft message for this, as it's intended to support
// testing use cases.
func (s *StateStore) UpsertCSIPlugin(index uint64, plug *structs.CSIPlugin) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	raw, err := txn.First("csi_plugins", "id", plug.ID)
	if err != nil {
		return fmt.Errorf("csi_plugin lookup error: %s %v", plug.ID, err)
	}

	if raw != nil {
		plug.CreateIndex = raw.(*structs.CSIPlugin).CreateIndex
	}
	plug.ModifyIndex = index

	if err = txn.Insert("csi_plugins", plug); err != nil {
		return fmt.Errorf("csi_plugins insert error: %v", err)
	}
	if err = txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}
	return txn.Commit()
}

// DeleteCSIPlugin deletes the plugin with the given ID if it's not in use.
// Deleting a plugin that is not stored is a no-op and returns nil.
func (s *StateStore) DeleteCSIPlugin(index uint64, id string) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	stored, err := s.CSIPluginByIDTxn(txn, nil, id)
	if err != nil {
		return err
	}
	if stored == nil {
		return nil
	}

	// Denormalize a copy so the emptiness check sees allocation details
	plug, err := s.CSIPluginDenormalizeTxn(txn, nil, stored.Copy())
	if err != nil {
		return err
	}
	if !plug.IsEmpty() {
		return fmt.Errorf("plugin in use")
	}

	if err := txn.Delete("csi_plugins", plug); err != nil {
		return fmt.Errorf("csi_plugins delete error: %v", err)
	}
	return txn.Commit()
}

// UpsertPeriodicLaunch registers a periodic launch or updates the stored one,
// and bumps the periodic_launch table index. A stored launch keeps its
// CreateIndex; ModifyIndex is always set to index.
func (s *StateStore) UpsertPeriodicLaunch(index uint64, launch *structs.PeriodicLaunch) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	// Check whether a launch is already recorded for this job
	raw, err := txn.First("periodic_launch", "id", launch.Namespace, launch.ID)
	if err != nil {
		return fmt.Errorf("periodic launch lookup failed: %v", err)
	}

	// Setup the indexes correctly
	createIndex := index
	if raw != nil {
		createIndex = raw.(*structs.PeriodicLaunch).CreateIndex
	}
	launch.CreateIndex = createIndex
	launch.ModifyIndex = index

	// Insert the launch
	if err := txn.Insert("periodic_launch", launch); err != nil {
		return fmt.Errorf("launch insert failed: %v", err)
	}
	if err := txn.Insert("index", &IndexEntry{"periodic_launch", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return txn.Commit()
}

// DeletePeriodicLaunch deletes the periodic launch of the given job in its
// own write transaction, committing only when the delete succeeded.
func (s *StateStore) DeletePeriodicLaunch(index uint64, namespace, jobID string) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	if err := s.DeletePeriodicLaunchTxn(index, namespace, jobID, txn); err != nil {
		return err
	}
	return txn.Commit()
}

// DeletePeriodicLaunchTxn deletes the periodic launch of the given job like
// DeletePeriodicLaunch, but inside a caller-supplied transaction that it does
// not commit. Useful for when making multiple modifications atomically. It
// returns an error when no launch is stored for the job.
func (s *StateStore) DeletePeriodicLaunchTxn(index uint64, namespace, jobID string, txn Txn) error {
	// Lookup the launch
	launch, err := txn.First("periodic_launch", "id", namespace, jobID)
	switch {
	case err != nil:
		return fmt.Errorf("launch lookup failed: %v", err)
	case launch == nil:
		return fmt.Errorf("launch not found")
	}

	// Delete the launch
	if err := txn.Delete("periodic_launch", launch); err != nil {
		return fmt.Errorf("launch delete failed: %v", err)
	}
	if err := txn.Insert("index", &IndexEntry{"periodic_launch", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return nil
}

// PeriodicLaunchByID looks up a periodic launch by the namespace and ID of
// its periodic job. It returns nil, nil when no launch is stored.
func (s *StateStore) PeriodicLaunchByID(ws memdb.WatchSet, namespace, id string) (*structs.PeriodicLaunch, error) {
	watchCh, raw, err := s.db.ReadTxn().FirstWatch("periodic_launch", "id", namespace, id)
	if err != nil {
		return nil, fmt.Errorf("periodic launch lookup failed: %v", err)
	}

	ws.Add(watchCh)

	if raw == nil {
		return nil, nil
	}
	return raw.(*structs.PeriodicLaunch), nil
}

// PeriodicLaunches walks the whole periodic_launch table and returns an
// iterator over every periodic launch.
func (s *StateStore) PeriodicLaunches(ws memdb.WatchSet) (memdb.ResultIterator, error) {
	launches, err := s.db.ReadTxn().Get("periodic_launch", "id")
	if err != nil {
		return nil, err
	}

	ws.Add(launches.WatchCh())
	return launches, nil
}

// UpsertEvals upserts a set of evaluations in its own write transaction,
// tagged with msgType, committing only when every upsert succeeded.
func (s *StateStore) UpsertEvals(msgType structs.MessageType, index uint64, evals []*structs.Evaluation) error {
	txn := s.db.WriteTxnMsgT(msgType, index)
	defer txn.Abort()

	if err := s.UpsertEvalsTxn(index, evals, txn); err != nil {
		return err
	}
	return txn.Commit()
}

// UpsertEvalsTxn upserts a set of evaluations like UpsertEvals, but inside a
// caller-supplied transaction that it does not commit. Useful for when
// making multiple modifications atomically. After the evaluations are
// written, the statuses of the jobs they belong to are recomputed.
func (s *StateStore) UpsertEvalsTxn(index uint64, evals []*structs.Evaluation, txn Txn) error {
	// Track each touched job once; the empty string value is what
	// setJobStatuses receives for every job.
	touched := make(map[structs.NamespacedID]string, len(evals))
	for _, eval := range evals {
		// Do a nested upsert
		if err := s.nestedUpsertEval(txn, index, eval); err != nil {
			return err
		}
		touched[structs.NamespacedID{ID: eval.JobID, Namespace: eval.Namespace}] = ""
	}

	// Set the job's status
	if err := s.setJobStatuses(index, txn, touched, false); err != nil {
		return fmt.Errorf("setting job status failed: %v", err)
	}

	return nil
}

// nestedUpsertEval upserts one evaluation inside an existing write
// transaction. Besides writing the evaluation and bumping the evals table
// index it:
//   - syncs the Queued counts of the job's summary with the evaluation's
//     QueuedAllocations, writing the summary only when a count changed;
//   - cancels the job's blocked evaluations when this evaluation is complete
//     and has no failed task group allocations.
func (s *StateStore) nestedUpsertEval(txn *txn, index uint64, eval *structs.Evaluation) error {
	// Lookup the evaluation
	prior, err := txn.First("evals", "id", eval.ID)
	if err != nil {
		return fmt.Errorf("eval lookup failed: %v", err)
	}

	// Update the indexes, keeping the create index of a stored evaluation
	if prior == nil {
		eval.CreateIndex = index
	} else {
		eval.CreateIndex = prior.(*structs.Evaluation).CreateIndex
	}
	eval.ModifyIndex = index

	// Update the job summary
	rawSummary, err := txn.First("job_summary", "id", eval.Namespace, eval.JobID)
	if err != nil {
		return fmt.Errorf("job summary lookup failed: %v", err)
	}
	if rawSummary != nil {
		jobSummary := rawSummary.(*structs.JobSummary).Copy()
		changed := false
		for tg, queued := range eval.QueuedAllocations {
			tgSummary, found := jobSummary.Summary[tg]
			if !found {
				s.logger.Error("unable to update queued for job and task group", "job_id", eval.JobID, "task_group", tg, "namespace", eval.Namespace)
				continue
			}
			if tgSummary.Queued != queued {
				tgSummary.Queued = queued
				jobSummary.Summary[tg] = tgSummary
				changed = true
			}
		}

		// Insert the job summary
		if changed {
			jobSummary.ModifyIndex = index
			if err := txn.Insert("job_summary", jobSummary); err != nil {
				return fmt.Errorf("job summary insert failed: %v", err)
			}
			if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
				return fmt.Errorf("index update failed: %v", err)
			}
		}
	}

	// Check if the job has any blocked evaluations and cancel them
	if eval.Status == structs.EvalStatusComplete && len(eval.FailedTGAllocs) == 0 {
		// Get the blocked evaluation for a job if it exists
		iter, err := txn.Get("evals", "job", eval.Namespace, eval.JobID, structs.EvalStatusBlocked)
		if err != nil {
			return fmt.Errorf("failed to get blocked evals for job %q in namespace %q: %v", eval.JobID, eval.Namespace, err)
		}

		// Drain the iterator before writing to the table it reads from
		var blocked []*structs.Evaluation
		for raw := iter.Next(); raw != nil; raw = iter.Next() {
			blocked = append(blocked, raw.(*structs.Evaluation))
		}

		// Go through and update the evals. The status description quotes
		// the ID of the cancelled evaluation itself.
		for _, blockedEval := range blocked {
			cancelled := blockedEval.Copy()
			cancelled.Status = structs.EvalStatusCancelled
			cancelled.StatusDescription = fmt.Sprintf("evaluation %q successful", cancelled.ID)
			cancelled.ModifyIndex = index

			if err := txn.Insert("evals", cancelled); err != nil {
				return fmt.Errorf("eval insert failed: %v", err)
			}
		}
	}

	// Insert the eval
	if err := txn.Insert("evals", eval); err != nil {
		return fmt.Errorf("eval insert failed: %v", err)
	}
	if err := txn.Insert("index", &IndexEntry{"evals", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}
	return nil
}

// updateEvalModifyIndex sets the modify index of a stored evaluation that has
// been through a scheduler pass. This is done as part of plan apply. It
// ensures that when a subsequent scheduler worker processes a re-queued
// evaluation it sees any partial updates from the plan apply. It returns an
// error when the evaluation is not stored.
func (s *StateStore) updateEvalModifyIndex(txn *txn, index uint64, evalID string) error {
	// Lookup the evaluation
	raw, err := txn.First("evals", "id", evalID)
	if err != nil {
		return fmt.Errorf("eval lookup failed: %v", err)
	}
	if raw == nil {
		s.logger.Error("unable to find eval", "eval_id", evalID)
		return fmt.Errorf("unable to find eval id %q", evalID)
	}

	// Write back a copy that differs only in its modify index
	updated := raw.(*structs.Evaluation).Copy()
	updated.ModifyIndex = index

	if err := txn.Insert("evals", updated); err != nil {
		return fmt.Errorf("eval insert failed: %v", err)
	}
	if err := txn.Insert("index", &IndexEntry{"evals", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}
	return nil
}

// DeleteEvalsByFilter deletes up to perPage evaluations that are both safe to
// delete and match the filter expression, skipping evaluations whose ID sorts
// before pageToken. The eval broker must be paused in the scheduler
// configuration, otherwise an error is returned. Evaluations for which the
// safety check or the filter evaluation fails are skipped rather than
// aborting the whole delete.
func (s *StateStore) DeleteEvalsByFilter(index uint64, filterExpr string, pageToken string, perPage int32) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	// These are always user-initiated, so ensure the eval broker is paused.
	_, schedConfig, err := s.schedulerConfigTxn(txn)
	if err != nil {
		return err
	}
	if schedConfig == nil || !schedConfig.PauseEvalBroker {
		return errors.New("eval broker is enabled; eval broker must be paused to delete evals")
	}

	filter, err := bexpr.CreateEvaluator(filterExpr)
	if err != nil {
		return err
	}

	iter, err := s.Evals(nil, SortDefault)
	if err != nil {
		return fmt.Errorf("failed to lookup evals: %v", err)
	}

	// Note: Paginator imports this package for testing so we can't just use
	// Paginator
	for deleted := int32(0); deleted < perPage; {
		raw := iter.Next()
		if raw == nil {
			break
		}

		eval := raw.(*structs.Evaluation)
		if eval.ID < pageToken {
			continue
		}

		if safe, err := s.EvalIsUserDeleteSafe(nil, eval); err != nil || !safe {
			continue
		}
		if matched, err := filter.Evaluate(eval); err != nil || !matched {
			continue
		}

		if err := txn.Delete("evals", eval); err != nil {
			return fmt.Errorf("eval delete failed: %v", err)
		}
		deleted++
	}

	return txn.Commit()
}

// EvalIsUserDeleteSafe reports whether an operator may delete the given
// evaluation. It looks up the job referenced by the eval and the allocations
// created by the eval, then defers the decision to isEvalDeleteSafe.
//
// The error return is kept separate from the boolean so that a failure to
// query the related objects can be told apart from an eval that simply is not
// safe to delete; on error the boolean is always false.
func (s *StateStore) EvalIsUserDeleteSafe(ws memdb.WatchSet, eval *structs.Evaluation) (bool, error) {
	evalJob, jobErr := s.JobByID(ws, eval.Namespace, eval.JobID)
	if jobErr != nil {
		return false, fmt.Errorf("failed to lookup job for eval: %v", jobErr)
	}

	evalAllocs, allocErr := s.AllocsByEval(ws, eval.ID)
	if allocErr != nil {
		return false, fmt.Errorf("failed to lookup eval allocs: %v", allocErr)
	}

	safe := isEvalDeleteSafe(evalAllocs, evalJob)
	return safe, nil
}

// isEvalDeleteSafe decides whether an eval may be deleted, given the
// allocations associated with it and the job it belongs to (which may be nil).
// It returns false as soon as one allocation is found that still needs the
// eval, and true otherwise.
func isEvalDeleteSafe(allocs []*structs.Allocation, job *structs.Job) bool {
	// A missing job never blocks deletion.
	if job == nil {
		return true
	}

	// Neither does a stopped or dead job.
	if job.Stop || job.Status == structs.JobStatusDead {
		return true
	}

	for _, a := range allocs {
		status := a.ClientStatus

		// An alloc that is running on the client, or whose state is unknown,
		// blocks deletion.
		if status == structs.AllocClientStatusRunning || status == structs.AllocClientStatusUnknown {
			return false
		}

		// Only failed allocs are considered for rescheduling; anything else
		// needs no further checks.
		if status != structs.AllocClientStatusFailed {
			continue
		}

		// Find the reschedule policy of the alloc's task group, if the group
		// still exists in the job.
		tg := job.LookupTaskGroup(a.TaskGroup)
		if tg == nil {
			continue
		}
		policy := tg.ReschedulePolicy
		if policy == nil {
			continue
		}

		// Rescheduling is disabled: not unlimited and zero attempts.
		if !policy.Unlimited && policy.Attempts == 0 {
			continue
		}

		// A failed alloc with no recorded next allocation, or one governed by
		// an unlimited reschedule policy, blocks deletion.
		if a.NextAllocation == "" || policy.Unlimited {
			return false
		}

		// No reschedule events have been tracked on this alloc yet.
		if a.RescheduleTracker == nil || len(a.RescheduleTracker.Events) == 0 {
			return false
		}
	}

	return true
}

// DeleteEval removes the evaluations and allocations with the given IDs in a
// single write transaction. IDs that are not present are skipped. When
// userInitiated is set the call is rejected unless the scheduler configuration
// has the eval broker paused. The index table entries for "evals" and "allocs"
// are only written for a table that actually had a row deleted, and the status
// of every job referenced by a deleted eval is recomputed before commit.
func (s *StateStore) DeleteEval(index uint64, evals, allocs []string, userInitiated bool) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	// Operator-driven deletions require a paused eval broker.
	if userInitiated {
		_, cfg, err := s.schedulerConfigTxn(txn)
		if err != nil {
			return err
		}
		if brokerPaused := cfg != nil && cfg.PauseEvalBroker; !brokerPaused {
			return errors.New("eval broker is enabled; eval broker must be paused to delete evals")
		}
	}

	// Jobs whose status must be re-evaluated once the deletions are done.
	affectedJobs := make(map[structs.NamespacedID]string, len(evals))

	// Track per-table modifications so untouched tables keep their index.
	var evalsDeleted, allocsDeleted bool

	for _, evalID := range evals {
		raw, err := txn.First("evals", "id", evalID)
		if err != nil {
			return fmt.Errorf("eval lookup failed: %v", err)
		}
		if raw == nil {
			continue
		}
		if err := txn.Delete("evals", raw); err != nil {
			return fmt.Errorf("eval delete failed: %v", err)
		}
		evalsDeleted = true

		deleted := raw.(*structs.Evaluation)
		key := structs.NamespacedID{
			ID:        deleted.JobID,
			Namespace: deleted.Namespace,
		}
		affectedJobs[key] = ""
	}

	for _, allocID := range allocs {
		raw, err := txn.First("allocs", "id", allocID)
		if err != nil {
			return fmt.Errorf("alloc lookup failed: %v", err)
		}
		if raw == nil {
			continue
		}
		if err := txn.Delete("allocs", raw); err != nil {
			return fmt.Errorf("alloc delete failed: %v", err)
		}
		allocsDeleted = true
	}

	// Bump the table indexes for the tables that changed.
	if evalsDeleted {
		if err := txn.Insert("index", &IndexEntry{Key: "evals", Value: index}); err != nil {
			return fmt.Errorf("index update failed: %v", err)
		}
	}
	if allocsDeleted {
		if err := txn.Insert("index", &IndexEntry{Key: "allocs", Value: index}); err != nil {
			return fmt.Errorf("index update failed: %v", err)
		}
	}

	// Recompute the status of the affected jobs.
	if err := s.setJobStatuses(index, txn, affectedJobs, true); err != nil {
		return fmt.Errorf("setting job status failed: %v", err)
	}

	return txn.Commit()
}

// EvalByID returns the evaluation with the given ID, or nil when no such
// evaluation exists. The watch channel for the lookup is added to ws.
func (s *StateStore) EvalByID(ws memdb.WatchSet, id string) (*structs.Evaluation, error) {
	txn := s.db.ReadTxn()

	watchCh, raw, err := txn.FirstWatch("evals", "id", id)
	if err != nil {
		return nil, fmt.Errorf("eval lookup failed: %v", err)
	}

	ws.Add(watchCh)

	if raw == nil {
		return nil, nil
	}
	return raw.(*structs.Evaluation), nil
}

// EvalsRelatedToID returns stubs for every evaluation reachable from the eval
// with the given ID by repeatedly following RelatedIDs. The requested eval
// itself is never part of the result. A nil slice is returned when the
// requested eval does not exist; otherwise the slice is non-nil, possibly
// empty. Related IDs that are empty or refer to missing evals are ignored.
func (s *StateStore) EvalsRelatedToID(ws memdb.WatchSet, id string) ([]*structs.EvaluationStub, error) {
	txn := s.db.ReadTxn()

	raw, err := txn.First("evals", "id", id)
	if err != nil {
		return nil, fmt.Errorf("eval lookup failed: %v", err)
	}
	if raw == nil {
		return nil, nil
	}
	root := raw.(*structs.Evaluation)

	// Seed the visited set with the root so it is never reported as related
	// to itself.
	visited := map[string]bool{root.ID: true}
	related := []*structs.EvaluationStub{}

	// Breadth-first walk over the IDs still to be examined.
	for queue := root.RelatedIDs(); len(queue) > 0; {
		next := queue[0]
		queue = queue[1:]

		// Ignore blank IDs and IDs that were already handled.
		if next == "" || visited[next] {
			continue
		}

		found, err := s.EvalByID(ws, next)
		if err != nil {
			return nil, err
		}
		if found == nil {
			continue
		}

		queue = append(queue, found.RelatedIDs()...)
		related = append(related, found.Stub())
		visited[found.ID] = true
	}

	return related, nil
}

// EvalsByIDPrefix returns an iterator over the evaluations whose ID starts
// with id, restricted to the given namespace by evalNamespaceFilter. The
// iteration order is reversed when sort is SortReverse.
func (s *StateStore) EvalsByIDPrefix(ws memdb.WatchSet, namespace, id string, sort SortOption) (memdb.ResultIterator, error) {
	txn := s.db.ReadTxn()

	var (
		iter memdb.ResultIterator
		err  error
	)

	// Query the ID prefix index in the requested direction.
	if sort == SortReverse {
		iter, err = txn.GetReverse("evals", "id_prefix", id)
	} else {
		iter, err = txn.Get("evals", "id_prefix", id)
	}
	if err != nil {
		return nil, fmt.Errorf("eval lookup failed: %v", err)
	}

	ws.Add(iter.WatchCh())

	// Drop evaluations from other namespaces.
	return memdb.NewFilterIterator(iter, evalNamespaceFilter(namespace)), nil
}

// evalNamespaceFilter builds a filter function for evaluation iterators. The
// returned function reports true (filter out) for values that are not
// evaluations and for evaluations outside the given namespace. When namespace
// is structs.AllNamespacesSentinel no evaluation is filtered out.
func evalNamespaceFilter(namespace string) func(interface{}) bool {
	return func(raw interface{}) bool {
		eval, ok := raw.(*structs.Evaluation)
		switch {
		case !ok:
			return true
		case namespace == structs.AllNamespacesSentinel:
			return false
		default:
			return eval.Namespace != namespace
		}
	}
}

// EvalsByJob returns the evaluations in the given namespace whose JobID is
// exactly jobID. The iterator's watch channel is added to ws.
func (s *StateStore) EvalsByJob(ws memdb.WatchSet, namespace, jobID string) ([]*structs.Evaluation, error) {
	txn := s.db.ReadTxn()

	// The lookup goes through the "job_prefix" index, so results are narrowed
	// to exact job ID matches below.
	iter, err := txn.Get("evals", "job_prefix", namespace, jobID)
	if err != nil {
		return nil, err
	}

	ws.Add(iter.WatchCh())

	var matches []*structs.Evaluation
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		if eval := raw.(*structs.Evaluation); eval.JobID == jobID {
			matches = append(matches, eval)
		}
	}
	return matches, nil
}

// Evals returns an iterator over all evaluations using the "create" index of
// the evals table. The iteration order is reversed when sort is SortReverse.
// The iterator's watch channel is added to ws.
func (s *StateStore) Evals(ws memdb.WatchSet, sort SortOption) (memdb.ResultIterator, error) {
	txn := s.db.ReadTxn()

	var (
		iter memdb.ResultIterator
		err  error
	)

	if sort == SortReverse {
		iter, err = txn.GetReverse("evals", "create")
	} else {
		iter, err = txn.Get("evals", "create")
	}
	if err != nil {
		return nil, err
	}

	ws.Add(iter.WatchCh())

	return iter, nil
}

// EvalsByNamespace returns an iterator over the evaluations found through the
// "namespace" index for the given namespace, in no particular order. The
// iterator's watch channel is added to ws.
//
// todo(shoenig): can this be removed?
func (s *StateStore) EvalsByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) {
	txn := s.db.ReadTxn()

	iter, err := txn.Get("evals", "namespace", namespace)
	if err != nil {
		return nil, err
	}

	ws.Add(iter.WatchCh())

	return iter, nil
}

// EvalsByNamespaceOrdered returns an iterator over evaluations looked up
// through the "namespace_create_prefix" index, using the terminated form of
// namespace as the key. The iteration order is reversed when sort is
// SortReverse. The iterator's watch channel is added to ws.
func (s *StateStore) EvalsByNamespaceOrdered(ws memdb.WatchSet, namespace string, sort SortOption) (memdb.ResultIterator, error) {
	txn := s.db.ReadTxn()

	key := terminate(namespace)

	var (
		iter memdb.ResultIterator
		err  error
	)
	if sort == SortReverse {
		iter, err = txn.GetReverse("evals", "namespace_create_prefix", key)
	} else {
		iter, err = txn.Get("evals", "namespace_create_prefix", key)
	}
	if err != nil {
		return nil, err
	}

	ws.Add(iter.WatchCh())

	return iter, nil
}

// UpdateAllocsFromClient applies a batch of client-reported allocation updates
// in one write transaction. Schedulers own the desired state of an allocation,
// while clients are authoritative for the actual state; this method handles
// the latter. After the allocations are updated, the "allocs" table index is
// bumped and every node referenced by the batch has its last alloc update
// index refreshed.
func (s *StateStore) UpdateAllocsFromClient(msgType structs.MessageType, index uint64, allocs []*structs.Allocation) error {
	txn := s.db.WriteTxnMsgT(msgType, index)
	defer txn.Abort()

	// Client updates are batched, so a single request may touch allocations
	// on several nodes; collect the distinct node IDs.
	touchedNodes := set.New[string](1)

	for _, update := range allocs {
		touchedNodes.Insert(update.NodeID)

		err := s.nestedUpdateAllocFromClient(txn, index, update)
		if err != nil {
			return err
		}
	}

	// Record the modification of the allocs table.
	if err := txn.Insert("index", &IndexEntry{Key: "allocs", Value: index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	// Record on each node when it last updated its allocs.
	for _, id := range touchedNodes.List() {
		if err := s.updateClientAllocUpdateIndex(txn, index, id); err != nil {
			return fmt.Errorf("node update failed: %v", err)
		}
	}

	return txn.Commit()
}

// nestedUpdateAllocFromClient is used to nest an update of an allocation with
// client status inside an existing write transaction.
//
// The stored allocation is copied and only the client-owned fields of alloc
// (client status and description, task states, network status, deployment
// health, modify time) are merged into the copy. The deployment, job summary,
// enterprise and plugin bookkeeping helpers are then run with the copy before
// it is inserted, and finally the owning job's status is recomputed. An alloc
// ID that is not in the allocs table is silently ignored.
func (s *StateStore) nestedUpdateAllocFromClient(txn *txn, index uint64, alloc *structs.Allocation) error {
	// Look for existing alloc
	existing, err := txn.First("allocs", "id", alloc.ID)
	if err != nil {
		return fmt.Errorf("alloc lookup failed: %v", err)
	}

	// Nothing to do if this does not exist
	if existing == nil {
		return nil
	}
	exist := existing.(*structs.Allocation)

	// Copy everything from the existing allocation
	copyAlloc := exist.Copy()

	// Pull in anything the client is the authority on
	copyAlloc.ClientStatus = alloc.ClientStatus
	copyAlloc.ClientDescription = alloc.ClientDescription
	copyAlloc.TaskStates = alloc.TaskStates
	copyAlloc.NetworkStatus = alloc.NetworkStatus

	// The client can only set its deployment health and timestamp, so just take
	// those
	if copyAlloc.DeploymentStatus != nil && alloc.DeploymentStatus != nil {
		oldHasHealthy := copyAlloc.DeploymentStatus.HasHealth()
		newHasHealthy := alloc.DeploymentStatus.HasHealth()

		// We got new health information from the client: either the stored
		// status had no health yet, or the reported health value differs.
		if newHasHealthy && (!oldHasHealthy || *copyAlloc.DeploymentStatus.Healthy != *alloc.DeploymentStatus.Healthy) {
			// Updated deployment health and timestamp
			copyAlloc.DeploymentStatus.Healthy = pointer.Of(*alloc.DeploymentStatus.Healthy)
			copyAlloc.DeploymentStatus.Timestamp = alloc.DeploymentStatus.Timestamp
			copyAlloc.DeploymentStatus.ModifyIndex = index
		}
	} else if alloc.DeploymentStatus != nil {
		// First time getting a deployment status so copy everything and just
		// set the index
		copyAlloc.DeploymentStatus = alloc.DeploymentStatus.Copy()
		copyAlloc.DeploymentStatus.ModifyIndex = index
	}

	// Update the modify index
	copyAlloc.ModifyIndex = index

	// Update the modify time
	copyAlloc.ModifyTime = alloc.ModifyTime

	// Run the dependent bookkeeping with both the updated copy and the
	// previously stored alloc, before the copy replaces it in the table.
	if err := s.updateDeploymentWithAlloc(index, copyAlloc, exist, txn); err != nil {
		return fmt.Errorf("error updating deployment: %v", err)
	}

	if err := s.updateSummaryWithAlloc(index, copyAlloc, exist, txn); err != nil {
		return fmt.Errorf("error updating job summary: %v", err)
	}

	if err := s.updateEntWithAlloc(index, copyAlloc, exist, txn); err != nil {
		return err
	}

	if err := s.updatePluginForTerminalAlloc(index, copyAlloc, txn); err != nil {
		return err
	}

	// Update the allocation
	if err := txn.Insert("allocs", copyAlloc); err != nil {
		return fmt.Errorf("alloc insert failed: %v", err)
	}

	// Set the job's status. A non-terminal alloc forces the job to running;
	// otherwise an empty force status is passed to setJobStatuses.
	forceStatus := ""
	if !copyAlloc.TerminalStatus() {
		forceStatus = structs.JobStatusRunning
	}

	tuple := structs.NamespacedID{
		ID:        exist.JobID,
		Namespace: exist.Namespace,
	}
	jobs := map[structs.NamespacedID]string{tuple: forceStatus}

	if err := s.setJobStatuses(index, txn, jobs, false); err != nil {
		return fmt.Errorf("setting job status failed: %v", err)
	}
	return nil
}

// updateClientAllocUpdateIndex stores index as the LastAllocUpdateIndex of the
// node with the given ID, working on a copy of the stored node, and bumps the
// "nodes" table index to the transaction's index. A node ID that is not in the
// nodes table is ignored.
func (s *StateStore) updateClientAllocUpdateIndex(txn *txn, index uint64, nodeID string) error {
	raw, err := txn.First("nodes", "id", nodeID)
	if err != nil {
		return fmt.Errorf("node lookup failed: %v", err)
	}
	if raw == nil {
		return nil
	}

	// Never mutate the stored node; update a copy and re-insert it.
	updated := raw.(*structs.Node).Copy()
	updated.LastAllocUpdateIndex = index

	if err := txn.Insert("nodes", updated); err != nil {
		return fmt.Errorf("node update failed: %v", err)
	}

	nodesIndex := &IndexEntry{Key: "nodes", Value: txn.Index}
	if err := txn.Insert("index", nodesIndex); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}
	return nil
}

// UpsertAllocs inserts or updates the given allocations in a single write
// transaction at the given index, which lets a set of allocations be evicted
// and new ones be placed at the same time. The work is done by
// upsertAllocsImpl; msgType is accepted but not used by this method.
func (s *StateStore) UpsertAllocs(msgType structs.MessageType, index uint64, allocs []*structs.Allocation) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	err := s.upsertAllocsImpl(index, allocs, txn)
	if err != nil {
		return err
	}

	return txn.Commit()
}

// upsertAllocsImpl is the actual implementation of UpsertAllocs so that it may
// be used with an existing transaction.
//
// For each allocation it sets the create/modify indexes, preserves the
// client-owned fields of an already stored allocation, runs the deployment,
// job summary, enterprise and plugin bookkeeping helpers, inserts the
// allocation, and links the previous allocation (if any) to the new one. The
// "allocs" table index and the statuses of all affected jobs are updated once
// after the loop. The passed allocations are modified in place.
func (s *StateStore) upsertAllocsImpl(index uint64, allocs []*structs.Allocation, txn *txn) error {
	// Handle the allocations
	jobs := make(map[structs.NamespacedID]string, 1)
	for _, alloc := range allocs {
		existing, err := txn.First("allocs", "id", alloc.ID)
		if err != nil {
			return fmt.Errorf("alloc lookup failed: %v", err)
		}
		// exist is nil when the allocation is not in the table yet.
		exist, _ := existing.(*structs.Allocation)

		if exist == nil {
			alloc.CreateIndex = index
			alloc.ModifyIndex = index
			alloc.AllocModifyIndex = index
			if alloc.DeploymentStatus != nil {
				alloc.DeploymentStatus.ModifyIndex = index
			}

			// Issue https://github.com/hashicorp/nomad/issues/2583 uncovered
			// a race between a forced garbage collection and the scheduler
			// marking an allocation as terminal. The issue is that the
			// allocation from the scheduler has its job normalized and the FSM
			// will only denormalize if the allocation is not terminal. However
			// if the allocation is garbage collected, that will result in an
			// allocation being upserted for the first time without a job
			// attached. By returning an error here, it will cause the FSM to
			// error, causing the plan_apply to error and thus causing the
			// evaluation to be failed. This will force an index refresh that
			// should solve this issue.
			if alloc.Job == nil {
				return fmt.Errorf("attempting to upsert allocation %q without a job", alloc.ID)
			}
		} else {
			alloc.CreateIndex = exist.CreateIndex
			alloc.ModifyIndex = index
			alloc.AllocModifyIndex = index

			// Keep the clients task states
			alloc.TaskStates = exist.TaskStates

			// If the scheduler is marking this allocation as lost or unknown we do not
			// want to reuse the status of the existing allocation.
			if alloc.ClientStatus != structs.AllocClientStatusLost &&
				alloc.ClientStatus != structs.AllocClientStatusUnknown {
				alloc.ClientStatus = exist.ClientStatus
				alloc.ClientDescription = exist.ClientDescription
			}

			// The incoming alloc carries no job, so re-attach the job from
			// the existing alloc.
			if alloc.Job == nil {
				alloc.Job = exist.Job
			}
		}

		// OPTIMIZATION:
		// These should be given a map of new to old allocation and the updates
		// should be one on all changes. The current implementation causes O(n)
		// lookups/copies/insertions rather than O(1)
		if err := s.updateDeploymentWithAlloc(index, alloc, exist, txn); err != nil {
			return fmt.Errorf("error updating deployment: %v", err)
		}

		if err := s.updateSummaryWithAlloc(index, alloc, exist, txn); err != nil {
			return fmt.Errorf("error updating job summary: %v", err)
		}

		if err := s.updateEntWithAlloc(index, alloc, exist, txn); err != nil {
			return err
		}

		if err := s.updatePluginForTerminalAlloc(index, alloc, txn); err != nil {
			return err
		}

		if err := txn.Insert("allocs", alloc); err != nil {
			return fmt.Errorf("alloc insert failed: %v", err)
		}

		// Point the previous allocation, when it is still stored, at this
		// allocation by updating a copy of it.
		if alloc.PreviousAllocation != "" {
			prevAlloc, err := txn.First("allocs", "id", alloc.PreviousAllocation)
			if err != nil {
				return fmt.Errorf("alloc lookup failed: %v", err)
			}
			existingPrevAlloc, _ := prevAlloc.(*structs.Allocation)
			if existingPrevAlloc != nil {
				prevAllocCopy := existingPrevAlloc.Copy()
				prevAllocCopy.NextAllocation = alloc.ID
				prevAllocCopy.ModifyIndex = index
				if err := txn.Insert("allocs", prevAllocCopy); err != nil {
					return fmt.Errorf("alloc insert failed: %v", err)
				}
			}
		}

		// If the allocation is not terminal, force the job to running status.
		forceStatus := ""
		if !alloc.TerminalStatus() {
			forceStatus = structs.JobStatusRunning
		}

		// Later allocs of the same job overwrite the force status recorded by
		// earlier ones in this batch.
		tuple := structs.NamespacedID{
			ID:        alloc.JobID,
			Namespace: alloc.Namespace,
		}
		jobs[tuple] = forceStatus
	}

	// Update the indexes
	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	// Set the job's status
	if err := s.setJobStatuses(index, txn, jobs, false); err != nil {
		return fmt.Errorf("setting job status failed: %v", err)
	}

	return nil
}

// UpdateAllocsDesiredTransitions merges the given desired transitions into the
// allocations keyed by ID and upserts the given evaluations, all within one
// write transaction. The "allocs" table index is bumped before commit.
func (s *StateStore) UpdateAllocsDesiredTransitions(msgType structs.MessageType, index uint64, allocs map[string]*structs.DesiredTransition,
	evals []*structs.Evaluation) error {

	txn := s.db.WriteTxnMsgT(msgType, index)
	defer txn.Abort()

	// Apply every requested transition to its allocation.
	for allocID, desired := range allocs {
		err := s.UpdateAllocDesiredTransitionTxn(txn, index, allocID, desired)
		if err != nil {
			return err
		}
	}

	// Upsert the evaluations that accompany the transitions.
	for _, e := range evals {
		err := s.nestedUpsertEval(txn, index, e)
		if err != nil {
			return err
		}
	}

	// Record the modification of the allocs table.
	if err := txn.Insert("index", &IndexEntry{Key: "allocs", Value: index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return txn.Commit()
}

// UpdateAllocDesiredTransitionTxn merges transition into the desired
// transition of the allocation with the given ID, inside an existing
// transaction. The stored allocation is copied, the copy's ModifyIndex and
// AllocModifyIndex are set to index, and the copy is re-inserted. An alloc ID
// that is not in the allocs table is ignored.
func (s *StateStore) UpdateAllocDesiredTransitionTxn(
	txn *txn, index uint64, allocID string,
	transition *structs.DesiredTransition) error {

	raw, err := txn.First("allocs", "id", allocID)
	if err != nil {
		return fmt.Errorf("alloc lookup failed: %v", err)
	}

	// Unknown allocation: nothing to update.
	if raw == nil {
		return nil
	}

	// Work on a copy so the stored object is never mutated.
	updated := raw.(*structs.Allocation).Copy()
	updated.DesiredTransition.Merge(transition)
	updated.ModifyIndex = index
	updated.AllocModifyIndex = index

	if err := txn.Insert("allocs", updated); err != nil {
		return fmt.Errorf("alloc insert failed: %v", err)
	}

	return nil
}

// AllocByID returns the allocation with the given ID, or nil when it does not
// exist. The lookup runs in a fresh read transaction via allocByIDImpl.
func (s *StateStore) AllocByID(ws memdb.WatchSet, id string) (*structs.Allocation, error) {
	readTxn := s.db.ReadTxn()
	alloc, err := s.allocByIDImpl(readTxn, ws, id)
	return alloc, err
}

// allocByIDImpl retrieves an allocation by ID using an existing transaction.
// The watch channel for the lookup is added to ws. It returns nil when no
// allocation has the given ID.
func (s *StateStore) allocByIDImpl(txn Txn, ws memdb.WatchSet, id string) (*structs.Allocation, error) {
	watchCh, raw, err := txn.FirstWatch("allocs", "id", id)
	if err != nil {
		return nil, fmt.Errorf("alloc lookup failed: %v", err)
	}

	ws.Add(watchCh)

	if raw != nil {
		return raw.(*structs.Allocation), nil
	}
	return nil, nil
}

// AllocsByIDPrefix returns an iterator over the allocations whose ID starts
// with id, restricted to the given namespace by allocNamespaceFilter. The
// iteration order is reversed when sort is SortReverse.
func (s *StateStore) AllocsByIDPrefix(ws memdb.WatchSet, namespace, id string, sort SortOption) (memdb.ResultIterator, error) {
	txn := s.db.ReadTxn()

	var (
		iter memdb.ResultIterator
		err  error
	)

	// Query the ID prefix index in the requested direction.
	if sort == SortReverse {
		iter, err = txn.GetReverse("allocs", "id_prefix", id)
	} else {
		iter, err = txn.Get("allocs", "id_prefix", id)
	}
	if err != nil {
		return nil, fmt.Errorf("alloc lookup failed: %v", err)
	}

	ws.Add(iter.WatchCh())

	// Drop allocations from other namespaces.
	return memdb.NewFilterIterator(iter, allocNamespaceFilter(namespace)), nil
}

// allocNamespaceFilter builds a filter function for allocation iterators. The
// returned function reports true (filter out) for values that are not
// allocations and for allocations outside the given namespace. When namespace
// is structs.AllNamespacesSentinel no allocation is filtered out.
func allocNamespaceFilter(namespace string) func(interface{}) bool {
	return func(raw interface{}) bool {
		alloc, ok := raw.(*structs.Allocation)
		switch {
		case !ok:
			return true
		case namespace == structs.AllNamespacesSentinel:
			return false
		default:
			return alloc.Namespace != namespace
		}
	}
}

// AllocsByIDPrefixAllNSs returns an iterator over the allocations whose ID
// starts with prefix, without any namespace filtering. The iterator's watch
// channel is added to ws.
func (s *StateStore) AllocsByIDPrefixAllNSs(ws memdb.WatchSet, prefix string) (memdb.ResultIterator, error) {
	txn := s.db.ReadTxn()

	matches, err := txn.Get("allocs", "id_prefix", prefix)
	if err != nil {
		return nil, fmt.Errorf("alloc lookup failed: %v", err)
	}

	ws.Add(matches.WatchCh())

	return matches, nil
}

// AllocsByNode returns the allocations of the given node, looked up in a fresh
// read transaction via allocsByNodeTxn.
func (s *StateStore) AllocsByNode(ws memdb.WatchSet, node string) ([]*structs.Allocation, error) {
	readTxn := s.db.ReadTxn()
	nodeAllocs, err := allocsByNodeTxn(readTxn, ws, node)
	return nodeAllocs, err
}

// allocsByNodeTxn collects the allocations of the given node using the
// supplied read transaction. The iterator's watch channel is added to ws.
func allocsByNodeTxn(txn ReadTxn, ws memdb.WatchSet, node string) ([]*structs.Allocation, error) {
	// Query by the node prefix only, which ignores the terminal status of
	// the allocations.
	iter, err := txn.Get("allocs", "node_prefix", node)
	if err != nil {
		return nil, err
	}

	ws.Add(iter.WatchCh())

	var nodeAllocs []*structs.Allocation
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		nodeAllocs = append(nodeAllocs, raw.(*structs.Allocation))
	}
	return nodeAllocs, nil
}

// AllocsByNodeTerminal returns the allocations of the given node that match
// the given terminal status, using the "node" index. The iterator's watch
// channel is added to ws.
func (s *StateStore) AllocsByNodeTerminal(ws memdb.WatchSet, node string, terminal bool) ([]*structs.Allocation, error) {
	txn := s.db.ReadTxn()

	iter, err := txn.Get("allocs", "node", node, terminal)
	if err != nil {
		return nil, err
	}

	ws.Add(iter.WatchCh())

	var nodeAllocs []*structs.Allocation
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		nodeAllocs = append(nodeAllocs, raw.(*structs.Allocation))
	}
	return nodeAllocs, nil
}

// AllocsByJob returns the allocations recorded for the given namespace and job
// ID. When anyCreateIndex is false and a job with that ID currently exists,
// only allocations whose attached job has the same CreateIndex as the current
// job are returned; otherwise all allocations for the job ID are returned.
// The allocation iterator's watch channel is added to ws.
func (s *StateStore) AllocsByJob(ws memdb.WatchSet, namespace, jobID string, anyCreateIndex bool) ([]*structs.Allocation, error) {
	txn := s.db.ReadTxn()

	// Look up the current job, if any, to compare create indexes against.
	rawJob, err := txn.First("jobs", "id", namespace, jobID)
	if err != nil {
		return nil, err
	}
	var current *structs.Job
	if rawJob != nil {
		current = rawJob.(*structs.Job)
	}

	iter, err := txn.Get("allocs", "job", namespace, jobID)
	if err != nil {
		return nil, err
	}

	ws.Add(iter.WatchCh())

	// Only filter by create index when asked to and a current job exists.
	matchCreateIndex := !anyCreateIndex && current != nil

	var jobAllocs []*structs.Allocation
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		alloc := raw.(*structs.Allocation)

		// Skip allocations that belong to a job with the same ID but a
		// different create index.
		if matchCreateIndex && alloc.Job.CreateIndex != current.CreateIndex {
			continue
		}
		jobAllocs = append(jobAllocs, alloc)
	}
	return jobAllocs, nil
}

// AllocsByEval returns the allocations found through the "eval" index for the
// given evaluation ID. The iterator's watch channel is added to ws.
func (s *StateStore) AllocsByEval(ws memdb.WatchSet, evalID string) ([]*structs.Allocation, error) {
	txn := s.db.ReadTxn()

	iter, err := txn.Get("allocs", "eval", evalID)
	if err != nil {
		return nil, err
	}

	ws.Add(iter.WatchCh())

	var evalAllocs []*structs.Allocation
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		evalAllocs = append(evalAllocs, raw.(*structs.Allocation))
	}
	return evalAllocs, nil
}

// AllocsByDeployment returns the allocations found through the "deployment"
// index for the given deployment ID. The iterator's watch channel is added to
// ws.
func (s *StateStore) AllocsByDeployment(ws memdb.WatchSet, deploymentID string) ([]*structs.Allocation, error) {
	txn := s.db.ReadTxn()

	iter, err := txn.Get("allocs", "deployment", deploymentID)
	if err != nil {
		return nil, err
	}

	ws.Add(iter.WatchCh())

	var deploymentAllocs []*structs.Allocation
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		deploymentAllocs = append(deploymentAllocs, raw.(*structs.Allocation))
	}
	return deploymentAllocs, nil
}

// Allocs returns an iterator over all allocations using the "create" index of
// the allocs table. The iteration order is reversed when sort is SortReverse.
// The iterator's watch channel is added to ws.
func (s *StateStore) Allocs(ws memdb.WatchSet, sort SortOption) (memdb.ResultIterator, error) {
	txn := s.db.ReadTxn()

	var (
		iter memdb.ResultIterator
		err  error
	)

	if sort == SortReverse {
		iter, err = txn.GetReverse("allocs", "create")
	} else {
		iter, err = txn.Get("allocs", "create")
	}
	if err != nil {
		return nil, err
	}

	ws.Add(iter.WatchCh())

	return iter, nil
}

// AllocsByNamespaceOrdered returns an iterator over allocations looked up
// through the "namespace_create_prefix" index, using the terminated form of
// namespace as the key. The iteration order is reversed when sort is
// SortReverse. The iterator's watch channel is added to ws.
func (s *StateStore) AllocsByNamespaceOrdered(ws memdb.WatchSet, namespace string, sort SortOption) (memdb.ResultIterator, error) {
	txn := s.db.ReadTxn()

	key := terminate(namespace)

	var (
		iter memdb.ResultIterator
		err  error
	)
	if sort == SortReverse {
		iter, err = txn.GetReverse("allocs", "namespace_create_prefix", key)
	} else {
		iter, err = txn.Get("allocs", "namespace_create_prefix", key)
	}
	if err != nil {
		return nil, err
	}

	ws.Add(iter.WatchCh())

	return iter, nil
}

// AllocsByNamespace returns an iterator over the allocations in the given
// namespace, looked up in a fresh read transaction via allocsByNamespaceImpl.
func (s *StateStore) AllocsByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) {
	readTxn := s.db.ReadTxn()
	iter, err := s.allocsByNamespaceImpl(ws, readTxn, namespace)
	return iter, err
}

// allocsByNamespaceImpl returns an iterator over the allocations found through
// the "namespace" index for the given namespace, using the supplied
// transaction. The iterator's watch channel is added to ws.
func (s *StateStore) allocsByNamespaceImpl(ws memdb.WatchSet, txn *txn, namespace string) (memdb.ResultIterator, error) {
	nsAllocs, err := txn.Get("allocs", "namespace", namespace)
	if err != nil {
		return nil, err
	}

	ws.Add(nsAllocs.WatchCh())

	return nsAllocs, nil
}

// UpsertVaultAccessor registers a set of Vault accessors in one write
// transaction. Each accessor's CreateIndex is set to index before it is
// inserted, and the "vault_accessors" table index is bumped before commit.
func (s *StateStore) UpsertVaultAccessor(index uint64, accessors []*structs.VaultAccessor) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	for _, va := range accessors {
		// Stamp the accessor with the index it is created at.
		va.CreateIndex = index

		err := txn.Insert("vault_accessors", va)
		if err != nil {
			return fmt.Errorf("accessor insert failed: %v", err)
		}
	}

	tableIndex := &IndexEntry{Key: "vault_accessors", Value: index}
	if err := txn.Insert("index", tableIndex); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return txn.Commit()
}

// DeleteVaultAccessors removes a set of Vault accessors in one write
// transaction and bumps the "vault_accessors" table index before commit. The
// first failing delete aborts the whole operation.
func (s *StateStore) DeleteVaultAccessors(index uint64, accessors []*structs.VaultAccessor) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	for _, va := range accessors {
		err := txn.Delete("vault_accessors", va)
		if err != nil {
			return fmt.Errorf("accessor delete failed: %v", err)
		}
	}

	tableIndex := &IndexEntry{Key: "vault_accessors", Value: index}
	if err := txn.Insert("index", tableIndex); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return txn.Commit()
}

// VaultAccessor returns the Vault accessor with the given ID, or nil when it
// does not exist. The watch channel for the lookup is added to ws.
func (s *StateStore) VaultAccessor(ws memdb.WatchSet, accessor string) (*structs.VaultAccessor, error) {
	txn := s.db.ReadTxn()

	watchCh, raw, err := txn.FirstWatch("vault_accessors", "id", accessor)
	if err != nil {
		return nil, fmt.Errorf("accessor lookup failed: %v", err)
	}

	ws.Add(watchCh)

	if raw == nil {
		return nil, nil
	}
	return raw.(*structs.VaultAccessor), nil
}

// VaultAccessors returns an iterator over all Vault accessors using the "id"
// index. The iterator's watch channel is added to ws.
func (s *StateStore) VaultAccessors(ws memdb.WatchSet) (memdb.ResultIterator, error) {
	txn := s.db.ReadTxn()

	all, err := txn.Get("vault_accessors", "id")
	if err != nil {
		return nil, err
	}

	ws.Add(all.WatchCh())

	return all, nil
}

// VaultAccessorsByAlloc returns the Vault accessors found through the
// "alloc_id" index for the given allocation ID. The iterator's watch channel
// is added to ws.
func (s *StateStore) VaultAccessorsByAlloc(ws memdb.WatchSet, allocID string) ([]*structs.VaultAccessor, error) {
	txn := s.db.ReadTxn()

	iter, err := txn.Get("vault_accessors", "alloc_id", allocID)
	if err != nil {
		return nil, err
	}

	ws.Add(iter.WatchCh())

	var found []*structs.VaultAccessor
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		found = append(found, raw.(*structs.VaultAccessor))
	}
	return found, nil
}

// VaultAccessorsByNode returns the Vault accessors found through the
// "node_id" index for the given node ID. The iterator's watch channel is
// added to ws.
func (s *StateStore) VaultAccessorsByNode(ws memdb.WatchSet, nodeID string) ([]*structs.VaultAccessor, error) {
	txn := s.db.ReadTxn()

	iter, err := txn.Get("vault_accessors", "node_id", nodeID)
	if err != nil {
		return nil, err
	}

	ws.Add(iter.WatchCh())

	var found []*structs.VaultAccessor
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		found = append(found, raw.(*structs.VaultAccessor))
	}
	return found, nil
}

// indexEntry builds the IndexEntry that records index as the value for the
// given table name.
func indexEntry(table string, index uint64) *IndexEntry {
	entry := IndexEntry{Key: table, Value: index}
	return &entry
}

// siTokenAccessorTable is the table name passed to the transaction calls that
// read and write Service Identity token accessors.
const siTokenAccessorTable = "si_token_accessors"

// UpsertSITokenAccessors registers a set of Service Identity token accessors
// in one write transaction. Each accessor's CreateIndex is set to index before
// it is inserted, and the table's index entry is bumped before commit.
func (s *StateStore) UpsertSITokenAccessors(index uint64, accessors []*structs.SITokenAccessor) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	for _, sia := range accessors {
		// Stamp the accessor with the index it is created at.
		sia.CreateIndex = index

		err := txn.Insert(siTokenAccessorTable, sia)
		if err != nil {
			return fmt.Errorf("accessor insert failed: %w", err)
		}
	}

	// Record the modification of the accessor table.
	tableIndex := indexEntry(siTokenAccessorTable, index)
	if err := txn.Insert("index", tableIndex); err != nil {
		return fmt.Errorf("index update failed: %w", err)
	}

	return txn.Commit()
}

// DeleteSITokenAccessors removes a set of Service Identity token accessors in
// one write transaction and bumps the table's index entry before commit. The
// first failing delete aborts the whole operation.
func (s *StateStore) DeleteSITokenAccessors(index uint64, accessors []*structs.SITokenAccessor) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	for _, sia := range accessors {
		err := txn.Delete(siTokenAccessorTable, sia)
		if err != nil {
			return fmt.Errorf("accessor delete failed: %w", err)
		}
	}

	// Record the modification of the accessor table.
	tableIndex := indexEntry(siTokenAccessorTable, index)
	if err := txn.Insert("index", tableIndex); err != nil {
		return fmt.Errorf("index update failed: %w", err)
	}

	return txn.Commit()
}

// SITokenAccessor looks up a single Service Identity token accessor by its ID.
// It returns (nil, nil) when no such accessor exists. The lookup's watch
// channel is added to ws in either case.
func (s *StateStore) SITokenAccessor(ws memdb.WatchSet, accessorID string) (*structs.SITokenAccessor, error) {
	readTxn := s.db.ReadTxn()
	defer readTxn.Abort()

	watchCh, raw, err := readTxn.FirstWatch(siTokenAccessorTable, "id", accessorID)
	if err != nil {
		return nil, fmt.Errorf("accessor lookup failed: %w", err)
	}

	ws.Add(watchCh)

	// No match is not an error.
	if raw == nil {
		return nil, nil
	}

	return raw.(*structs.SITokenAccessor), nil
}

// SITokenAccessors returns an iterator over all Service Identity token
// accessors, walking the table's "id" index. The iterator's watch channel is
// added to ws.
func (s *StateStore) SITokenAccessors(ws memdb.WatchSet) (memdb.ResultIterator, error) {
	readTxn := s.db.ReadTxn()
	defer readTxn.Abort()

	accessors, err := readTxn.Get(siTokenAccessorTable, "id")
	if err == nil {
		ws.Add(accessors.WatchCh())
		return accessors, nil
	}

	return nil, err
}

// SITokenAccessorsByAlloc collects every Service Identity token accessor
// registered under the given alloc ID. The iterator's watch channel is added
// to ws. The returned slice is nil when nothing matches.
func (s *StateStore) SITokenAccessorsByAlloc(ws memdb.WatchSet, allocID string) ([]*structs.SITokenAccessor, error) {
	readTxn := s.db.ReadTxn()
	defer readTxn.Abort()

	// Query the accessors table through its alloc_id index.
	matches, err := readTxn.Get(siTokenAccessorTable, "alloc_id", allocID)
	if err != nil {
		return nil, err
	}

	ws.Add(matches.WatchCh())

	var accessors []*structs.SITokenAccessor
	for {
		raw := matches.Next()
		if raw == nil {
			break
		}
		accessors = append(accessors, raw.(*structs.SITokenAccessor))
	}

	return accessors, nil
}

// SITokenAccessorsByNode collects every Service Identity token accessor
// registered under the given node ID. The iterator's watch channel is added to
// ws. The returned slice is nil when nothing matches.
func (s *StateStore) SITokenAccessorsByNode(ws memdb.WatchSet, nodeID string) ([]*structs.SITokenAccessor, error) {
	readTxn := s.db.ReadTxn()
	defer readTxn.Abort()

	// Query the accessors table through its node_id index.
	matches, err := readTxn.Get(siTokenAccessorTable, "node_id", nodeID)
	if err != nil {
		return nil, err
	}

	ws.Add(matches.WatchCh())

	var accessors []*structs.SITokenAccessor
	for {
		raw := matches.Next()
		if raw == nil {
			break
		}
		accessors = append(accessors, raw.(*structs.SITokenAccessor))
	}

	return accessors, nil
}

// UpdateDeploymentStatus applies a deployment status update and, when the
// request carries them, upserts the accompanying job and evaluation. All
// writes happen in a single transaction that is committed only if every step
// succeeds.
func (s *StateStore) UpdateDeploymentStatus(msgType structs.MessageType, index uint64, req *structs.DeploymentStatusUpdateRequest) error {
	writeTxn := s.db.WriteTxnMsgT(msgType, index)
	defer writeTxn.Abort()

	// The status change itself always runs first.
	err := s.updateDeploymentStatusImpl(index, req.DeploymentUpdate, writeTxn)
	if err != nil {
		return err
	}

	// The job is optional.
	if job := req.Job; job != nil {
		if err = s.upsertJobImpl(index, nil, job, false, writeTxn); err != nil {
			return err
		}
	}

	// The evaluation is optional.
	if eval := req.Eval; eval != nil {
		if err = s.nestedUpsertEval(writeTxn, index, eval); err != nil {
			return err
		}
	}

	return writeTxn.Commit()
}

// updateDeploymentStatusImpl applies the status and status description from u
// to the referenced deployment within the caller's transaction. It fails if
// the deployment does not exist or is no longer active. When the new status is
// successful, the deployment's job version is also marked stable.
func (s *StateStore) updateDeploymentStatusImpl(index uint64, u *structs.DeploymentStatusUpdate, txn *txn) error {
	// Look up the deployment; the watch set is only needed to satisfy the
	// lookup's signature.
	deployment, err := s.deploymentByIDImpl(memdb.NewWatchSet(), u.DeploymentID, txn)
	switch {
	case err != nil:
		return err
	case deployment == nil:
		return fmt.Errorf("Deployment ID %q couldn't be updated as it does not exist", u.DeploymentID)
	case !deployment.Active():
		return fmt.Errorf("Deployment %q has terminal status %q:", deployment.ID, deployment.Status)
	}

	// Work on a copy so the stored object is never mutated in place.
	updated := deployment.Copy()
	updated.Status, updated.StatusDescription = u.Status, u.StatusDescription
	updated.ModifyIndex = index

	if err := txn.Insert("deployment", updated); err != nil {
		return err
	}

	if err := txn.Insert("index", &IndexEntry{Key: "deployment", Value: index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	// Only a successful deployment marks its job version as stable.
	if updated.Status != structs.DeploymentStatusSuccessful {
		return nil
	}
	if err := s.updateJobStabilityImpl(index, updated.Namespace, updated.JobID, updated.JobVersion, true, txn); err != nil {
		return fmt.Errorf("failed to update job stability: %v", err)
	}

	return nil
}

// UpdateJobStability sets the stable flag of the given job version to the
// requested value inside its own write transaction.
func (s *StateStore) UpdateJobStability(index uint64, namespace, jobID string, jobVersion uint64, stable bool) error {
	writeTxn := s.db.WriteTxn(index)
	defer writeTxn.Abort()

	err := s.updateJobStabilityImpl(index, namespace, jobID, jobVersion, stable, writeTxn)
	if err != nil {
		return err
	}

	return writeTxn.Commit()
}

// updateJobStabilityImpl sets the stable flag of the given job version within
// the caller's transaction. It is a no-op when that version cannot be found or
// already has the requested stability.
func (s *StateStore) updateJobStabilityImpl(index uint64, namespace, jobID string, jobVersion uint64, stable bool, txn *txn) error {
	job, err := s.jobByIDAndVersionImpl(nil, namespace, jobID, jobVersion, txn)
	if err != nil {
		return err
	}

	// Nothing to do if the version has been cleared or is already in the
	// desired state.
	if job == nil || job.Stable == stable {
		return nil
	}

	// Upsert a modified copy rather than mutating the stored job.
	updated := job.Copy()
	updated.Stable = stable
	return s.upsertJobImpl(index, nil, updated, true, txn)
}

// UpdateDeploymentPromotion is used to promote canaries in a deployment and
// potentially make a evaluation.
//
// Within a single write transaction it:
//   - fails if the deployment does not exist or is not active;
//   - collects the deployment's non-terminal, healthy canary allocations that
//     belong to the groups being promoted (all groups when req.All is set);
//   - fails with an aggregated error if any group being promoted has fewer
//     healthy canaries than its DesiredCanaries;
//   - marks the affected task groups as promoted and upserts the deployment;
//   - upserts req.Eval when present;
//   - rewrites each promoted allocation with its canary flag cleared.
func (s *StateStore) UpdateDeploymentPromotion(msgType structs.MessageType, index uint64, req *structs.ApplyDeploymentPromoteRequest) error {
	txn := s.db.WriteTxnMsgT(msgType, index)
	defer txn.Abort()

	// Retrieve deployment and ensure it is not terminal and is active
	ws := memdb.NewWatchSet()
	deployment, err := s.deploymentByIDImpl(ws, req.DeploymentID, txn)
	if err != nil {
		return err
	} else if deployment == nil {
		return fmt.Errorf("Deployment ID %q couldn't be updated as it does not exist", req.DeploymentID)
	} else if !deployment.Active() {
		return fmt.Errorf("Deployment %q has terminal status %q:", deployment.ID, deployment.Status)
	}

	// Retrieve affected allocations
	iter, err := txn.Get("allocs", "deployment", req.DeploymentID)
	if err != nil {
		return err
	}

	// groupIndex is a map of groups being promoted
	groupIndex := make(map[string]struct{}, len(req.Groups))
	for _, g := range req.Groups {
		groupIndex[g] = struct{}{}
	}

	// canaryIndex is the set of placed canaries in the deployment
	canaryIndex := make(map[string]struct{}, len(deployment.TaskGroups))
	for _, dstate := range deployment.TaskGroups {
		for _, c := range dstate.PlacedCanaries {
			canaryIndex[c] = struct{}{}
		}
	}

	// healthyCounts is a mapping of group to the number of healthy canaries
	healthyCounts := make(map[string]int, len(deployment.TaskGroups))

	// promotable is the set of allocations that we can move from canary to
	// non-canary
	var promotable []*structs.Allocation

	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		alloc := raw.(*structs.Allocation)

		// Check that the alloc is a canary
		if _, ok := canaryIndex[alloc.ID]; !ok {
			continue
		}

		// Check that the canary is part of a group being promoted; when
		// req.All is set every group qualifies.
		if _, ok := groupIndex[alloc.TaskGroup]; !req.All && !ok {
			continue
		}

		// Ensure the canaries are healthy
		if alloc.TerminalStatus() || !alloc.DeploymentStatus.IsHealthy() {
			continue
		}

		healthyCounts[alloc.TaskGroup]++
		promotable = append(promotable, alloc)
	}

	// Determine if we have enough healthy allocations. Every failing group is
	// reported, not just the first one.
	var unhealthyErr multierror.Error
	for tg, dstate := range deployment.TaskGroups {
		if _, ok := groupIndex[tg]; !req.All && !ok {
			continue
		}

		// Groups that do not want canaries have nothing to check.
		need := dstate.DesiredCanaries
		if need == 0 {
			continue
		}

		if have := healthyCounts[tg]; have < need {
			multierror.Append(&unhealthyErr, fmt.Errorf("Task group %q has %d/%d healthy allocations", tg, have, need))
		}
	}

	if err := unhealthyErr.ErrorOrNil(); err != nil {
		return err
	}

	// Update deployment (on a copy, never the stored object)
	copy := deployment.Copy()
	copy.ModifyIndex = index
	for tg, status := range copy.TaskGroups {
		_, ok := groupIndex[tg]
		if !req.All && !ok {
			continue
		}

		// reset the progress deadline; this uses the local wall clock at the
		// time this method runs, and only applies when a deadline is configured
		// and was already started.
		if status.ProgressDeadline > 0 && !status.RequireProgressBy.IsZero() {
			status.RequireProgressBy = time.Now().Add(status.ProgressDeadline)
		}
		status.Promoted = true
	}

	// If the deployment no longer needs promotion, update its status
	if !copy.RequiresPromotion() && copy.Status == structs.DeploymentStatusRunning {
		copy.StatusDescription = structs.DeploymentStatusDescriptionRunning
	}

	// Insert the deployment
	if err := s.upsertDeploymentImpl(index, copy, txn); err != nil {
		return err
	}

	// Upsert the optional eval
	if req.Eval != nil {
		if err := s.nestedUpsertEval(txn, index, req.Eval); err != nil {
			return err
		}
	}

	// For each promotable allocation remove the canary field. DeploymentStatus
	// is dereferenced directly here; promotable only holds allocations that
	// passed the IsHealthy check above.
	for _, alloc := range promotable {
		promoted := alloc.Copy()
		promoted.DeploymentStatus.Canary = false
		promoted.DeploymentStatus.ModifyIndex = index
		promoted.ModifyIndex = index
		promoted.AllocModifyIndex = index

		if err := txn.Insert("allocs", promoted); err != nil {
			return fmt.Errorf("alloc insert failed: %v", err)
		}
	}

	// Update the alloc index
	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return txn.Commit()
}

// UpdateDeploymentAllocHealth is used to update the health of allocations as
// part of the deployment and potentially make a evaluation.
//
// Within a single write transaction it:
//   - fails if the deployment does not exist or is not active;
//   - marks every ID in req.HealthyAllocationIDs healthy and every ID in
//     req.UnhealthyAllocationIDs unhealthy, failing on unknown allocations or
//     allocations that belong to a different deployment;
//   - applies req.DeploymentUpdate, req.Job and req.Eval when present.
func (s *StateStore) UpdateDeploymentAllocHealth(msgType structs.MessageType, index uint64, req *structs.ApplyDeploymentAllocHealthRequest) error {
	txn := s.db.WriteTxnMsgT(msgType, index)
	defer txn.Abort()

	// Retrieve deployment and ensure it is not terminal and is active
	ws := memdb.NewWatchSet()
	deployment, err := s.deploymentByIDImpl(ws, req.DeploymentID, txn)
	if err != nil {
		return err
	} else if deployment == nil {
		return fmt.Errorf("Deployment ID %q couldn't be updated as it does not exist", req.DeploymentID)
	} else if !deployment.Active() {
		return fmt.Errorf("Deployment %q has terminal status %q:", deployment.ID, deployment.Status)
	}

	// Update the health status of each allocation. The allocs index is only
	// touched when the request names at least one allocation.
	if total := len(req.HealthyAllocationIDs) + len(req.UnhealthyAllocationIDs); total != 0 {
		// setAllocHealth records the given health and timestamp on a copy of
		// the allocation with the given ID, updates the deployment state via
		// updateDeploymentWithAlloc, and then stores the copy.
		setAllocHealth := func(id string, healthy bool, ts time.Time) error {
			existing, err := txn.First("allocs", "id", id)
			if err != nil {
				return fmt.Errorf("alloc %q lookup failed: %v", id, err)
			}
			if existing == nil {
				return fmt.Errorf("unknown alloc %q", id)
			}

			// Reject allocations that belong to another deployment.
			old := existing.(*structs.Allocation)
			if old.DeploymentID != req.DeploymentID {
				return fmt.Errorf("alloc %q is not part of deployment %q", id, req.DeploymentID)
			}

			// Set the health
			copy := old.Copy()
			if copy.DeploymentStatus == nil {
				copy.DeploymentStatus = &structs.AllocDeploymentStatus{}
			}
			copy.DeploymentStatus.Healthy = pointer.Of(healthy)
			copy.DeploymentStatus.Timestamp = ts
			copy.DeploymentStatus.ModifyIndex = index
			copy.ModifyIndex = index

			// The deployment is updated before the allocation is inserted, with
			// the stored allocation passed as the previous state.
			if err := s.updateDeploymentWithAlloc(index, copy, old, txn); err != nil {
				return fmt.Errorf("error updating deployment: %v", err)
			}

			if err := txn.Insert("allocs", copy); err != nil {
				return fmt.Errorf("alloc insert failed: %v", err)
			}

			return nil
		}

		for _, id := range req.HealthyAllocationIDs {
			if err := setAllocHealth(id, true, req.Timestamp); err != nil {
				return err
			}
		}
		for _, id := range req.UnhealthyAllocationIDs {
			if err := setAllocHealth(id, false, req.Timestamp); err != nil {
				return err
			}
		}

		// Update the indexes
		if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
			return fmt.Errorf("index update failed: %v", err)
		}
	}

	// Update the deployment status as needed.
	if req.DeploymentUpdate != nil {
		if err := s.updateDeploymentStatusImpl(index, req.DeploymentUpdate, txn); err != nil {
			return err
		}
	}

	// Upsert the job if necessary
	if req.Job != nil {
		if err := s.upsertJobImpl(index, nil, req.Job, false, txn); err != nil {
			return err
		}
	}

	// Upsert the optional eval
	if req.Eval != nil {
		if err := s.nestedUpsertEval(txn, index, req.Eval); err != nil {
			return err
		}
	}

	return txn.Commit()
}

// LatestIndex scans every entry of the index table and returns the largest
// value found, or 0 when the table is empty.
func (s *StateStore) LatestIndex() (uint64, error) {
	entries, err := s.Indexes()
	if err != nil {
		return 0, err
	}

	var latest uint64
	for raw := entries.Next(); raw != nil; raw = entries.Next() {
		// Keep the running maximum.
		if value := raw.(*IndexEntry).Value; value > latest {
			latest = value
		}
	}

	return latest, nil
}

// Index returns the value stored in the index table under the given name, or
// 0 when no such entry exists.
func (s *StateStore) Index(name string) (uint64, error) {
	readTxn := s.db.ReadTxn()

	entry, err := readTxn.First("index", "id", name)
	switch {
	case err != nil:
		return 0, err
	case entry == nil:
		// A missing entry reads as index 0.
		return 0, nil
	}
	return entry.(*IndexEntry).Value, nil
}

// Indexes returns an iterator over every entry of the index table.
func (s *StateStore) Indexes() (memdb.ResultIterator, error) {
	readTxn := s.db.ReadTxn()

	// Walk the entire index table through its "id" index.
	entries, err := readTxn.Get("index", "id")
	if err == nil {
		return entries, nil
	}
	return nil, err
}

// ReconcileJobSummaries re-creates summaries for all jobs present in the state
// store.
//
// Parameterized and periodic jobs get a summary whose children counts are
// recomputed from the statuses of their child jobs; such a job is skipped if
// it has no stored summary. All other jobs get a summary rebuilt from their
// allocations. Everything happens in one write transaction, and the
// job_summary index is set to the given index.
func (s *StateStore) ReconcileJobSummaries(index uint64) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	// Get all the jobs
	iter, err := txn.Get("jobs", "id")
	if err != nil {
		return err
	}
	// COMPAT: Remove after 0.11
	// Iterate over jobs to build a list of parent jobs and their children.
	// The map is keyed by ParentID alone; the namespace is not part of the key.
	parentMap := make(map[string][]*structs.Job)
	for {
		rawJob := iter.Next()
		if rawJob == nil {
			break
		}
		job := rawJob.(*structs.Job)
		if job.ParentID != "" {
			children := parentMap[job.ParentID]
			children = append(children, job)
			parentMap[job.ParentID] = children
		}
	}

	// Get all the jobs again
	iter, err = txn.Get("jobs", "id")
	if err != nil {
		return err
	}

	for {
		rawJob := iter.Next()
		if rawJob == nil {
			break
		}
		job := rawJob.(*structs.Job)

		if job.IsParameterized() || job.IsPeriodic() {
			// COMPAT: Remove after 0.11

			// The following block of code fixes incorrect child summaries due to a bug
			// See https://github.com/hashicorp/nomad/issues/3886 for details
			rawSummary, err := txn.First("job_summary", "id", job.Namespace, job.ID)
			if err != nil {
				return err
			}
			// Parent jobs without a stored summary are left untouched.
			if rawSummary == nil {
				continue
			}

			oldSummary := rawSummary.(*structs.JobSummary)

			// Create an empty summary
			summary := &structs.JobSummary{
				JobID:     job.ID,
				Namespace: job.Namespace,
				Summary:   make(map[string]structs.TaskGroupSummary),
				Children:  &structs.JobChildrenSummary{},
			}

			// Iterate over children of this job if any to fix summary counts.
			// Children with any other status are not counted.
			children := parentMap[job.ID]
			for _, childJob := range children {
				switch childJob.Status {
				case structs.JobStatusPending:
					summary.Children.Pending++
				case structs.JobStatusDead:
					summary.Children.Dead++
				case structs.JobStatusRunning:
					summary.Children.Running++
				}
			}

			// Insert the job summary if its different. Note that the comparison
			// runs before CreateIndex and ModifyIndex are set on the new summary.
			if !reflect.DeepEqual(summary, oldSummary) {
				// Set the create index of the summary same as the job's create index
				// and the modify index to the current index
				summary.CreateIndex = job.CreateIndex
				summary.ModifyIndex = index

				if err := txn.Insert("job_summary", summary); err != nil {
					return fmt.Errorf("error inserting job summary: %v", err)
				}
			}

			// Done with handling a parent job, continue to next
			continue
		}

		// Create a job summary for the job, seeded with an empty entry for each
		// of the job's task groups
		summary := &structs.JobSummary{
			JobID:     job.ID,
			Namespace: job.Namespace,
			Summary:   make(map[string]structs.TaskGroupSummary),
		}
		for _, tg := range job.TaskGroups {
			summary.Summary[tg.Name] = structs.TaskGroupSummary{}
		}

		// Find all the allocations for the jobs
		iterAllocs, err := txn.Get("allocs", "job", job.Namespace, job.ID)
		if err != nil {
			return err
		}

		// Calculate the summary for the job
		for {
			rawAlloc := iterAllocs.Next()
			if rawAlloc == nil {
				break
			}
			alloc := rawAlloc.(*structs.Allocation)

			// Ignore the allocation if it doesn't belong to the currently
			// registered job. The allocation is checked because of issue #2304
			if alloc.Job == nil || alloc.Job.CreateIndex != job.CreateIndex {
				continue
			}

			// TaskGroupSummary is stored by value, so modify a local copy and
			// write it back below. Pending allocations count as Starting.
			tg := summary.Summary[alloc.TaskGroup]
			switch alloc.ClientStatus {
			case structs.AllocClientStatusFailed:
				tg.Failed += 1
			case structs.AllocClientStatusLost:
				tg.Lost += 1
			case structs.AllocClientStatusUnknown:
				tg.Unknown += 1
			case structs.AllocClientStatusComplete:
				tg.Complete += 1
			case structs.AllocClientStatusRunning:
				tg.Running += 1
			case structs.AllocClientStatusPending:
				tg.Starting += 1
			default:
				s.logger.Error("invalid client status set on allocation", "client_status", alloc.ClientStatus, "alloc_id", alloc.ID)
			}
			summary.Summary[alloc.TaskGroup] = tg
		}

		// Set the create index of the summary same as the job's create index
		// and the modify index to the current index
		summary.CreateIndex = job.CreateIndex
		summary.ModifyIndex = index

		// Insert the job summary
		if err := txn.Insert("job_summary", summary); err != nil {
			return fmt.Errorf("error inserting job summary: %v", err)
		}
	}

	// Update the indexes table for job summary
	if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}
	return txn.Commit()
}

// setJobStatuses is a helper for calling setJobStatus on multiple jobs by ID.
// It takes a map of namespaced job IDs to an optional forceStatus string.
// Jobs that are not found in the state store are skipped. An error is returned
// only if a job lookup or setJobStatus fails, in which case the remaining jobs
// are not processed.
func (s *StateStore) setJobStatuses(index uint64, txn *txn,
	jobs map[structs.NamespacedID]string, evalDelete bool) error {
	for tuple, forceStatus := range jobs {

		existing, err := txn.First("jobs", "id", tuple.Namespace, tuple.ID)
		if err != nil {
			// Wrap so callers can still inspect the underlying error.
			return fmt.Errorf("job lookup failed: %w", err)
		}

		// A missing job is not an error; there is simply nothing to update.
		if existing == nil {
			continue
		}

		if err := s.setJobStatus(index, txn, existing.(*structs.Job), evalDelete, forceStatus); err != nil {
			return err
		}

	}

	return nil
}

// setJobStatus sets the status of the job by looking up associated evaluations
// and allocations. evalDelete should be set to true if setJobStatus is being
// called because an evaluation is being deleted (potentially because of garbage
// collection). If forceStatus is non-empty, the job's status will be set to the
// passed status.
//
// When the status does not change nothing is written. Otherwise a copy of the
// job with the new status and ModifyIndex is inserted, the jobs index is
// bumped, and setJobSummary is called with the old and new status. All errors
// from those writes are wrapped so the underlying cause stays inspectable.
func (s *StateStore) setJobStatus(index uint64, txn *txn,
	job *structs.Job, evalDelete bool, forceStatus string) error {

	// Capture the current status so we can check if there is a change
	oldStatus := job.Status
	newStatus := forceStatus

	// If forceStatus is not set, compute the jobs status.
	if forceStatus == "" {
		var err error
		newStatus, err = s.getJobStatus(txn, job, evalDelete)
		if err != nil {
			return err
		}
	}

	// Fast-path if the job has not changed.
	if oldStatus == newStatus {
		return nil
	}

	// Copy and update the existing job
	updated := job.Copy()
	updated.Status = newStatus
	updated.ModifyIndex = index

	// Insert the job
	if err := txn.Insert("jobs", updated); err != nil {
		return fmt.Errorf("job insert failed: %w", err)
	}
	if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil {
		return fmt.Errorf("index update failed: %w", err)
	}

	// Update the children summary
	if err := s.setJobSummary(txn, updated, index, oldStatus, newStatus); err != nil {
		return fmt.Errorf("job summary update failed %w", err)
	}
	return nil
}

// setJobSummary moves a child job from oldStatus to newStatus in the children
// counts of its parent's job summary. It is a no-op for jobs without a parent
// and for parents whose summary no longer exists. An empty oldStatus skips the
// decrement; any other unrecognised old or new status is an error.
func (s *StateStore) setJobSummary(txn *txn, updated *structs.Job, index uint64, oldStatus, newStatus string) error {
	// Only child jobs are reflected in a parent's summary.
	if updated.ParentID == "" {
		return nil
	}

	summaryRaw, err := txn.First("job_summary", "id", updated.Namespace, updated.ParentID)
	if err != nil {
		return fmt.Errorf("unable to retrieve summary for parent job: %v", err)
	}

	// The summary may be missing if the parent job was removed.
	if summaryRaw == nil {
		return nil
	}

	// Work on a copy and make sure it has somewhere to keep the counts.
	parentSummary := summaryRaw.(*structs.JobSummary).Copy()
	if parentSummary.Children == nil {
		parentSummary.Children = new(structs.JobChildrenSummary)
	}
	counts := parentSummary.Children

	// Take the child out of the bucket for its previous status.
	switch oldStatus {
	case "":
		// No previous status, so there is nothing to decrement.
	case structs.JobStatusPending:
		counts.Pending--
	case structs.JobStatusRunning:
		counts.Running--
	case structs.JobStatusDead:
		counts.Dead--
	default:
		return fmt.Errorf("unknown old job status %q", oldStatus)
	}

	// Put the child into the bucket for its new status.
	switch newStatus {
	case structs.JobStatusPending:
		counts.Pending++
	case structs.JobStatusRunning:
		counts.Running++
	case structs.JobStatusDead:
		counts.Dead++
	default:
		return fmt.Errorf("unknown new job status %q", newStatus)
	}

	parentSummary.ModifyIndex = index

	// Persist the summary and bump the table index.
	if err := txn.Insert("job_summary", parentSummary); err != nil {
		return fmt.Errorf("job summary insert failed: %v", err)
	}
	if err := txn.Insert("index", &IndexEntry{Key: "job_summary", Value: index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}
	return nil
}

// getJobStatus computes the status of a job from its type, allocations and
// evaluations:
//   - system, parameterized and periodic jobs are running unless stopped, in
//     which case they are dead;
//   - any non-terminal allocation makes the job running;
//   - otherwise any non-terminal evaluation makes the job pending;
//   - otherwise the job is dead if evalDelete is set or it has at least one
//     allocation or evaluation, and pending if it has neither.
func (s *StateStore) getJobStatus(txn *txn, job *structs.Job, evalDelete bool) (string, error) {
	// These job kinds stay running until they are explicitly stopped.
	runsUntilStopped := job.Type == structs.JobTypeSystem ||
		job.IsParameterized() ||
		job.IsPeriodic()
	if runsUntilStopped {
		status := structs.JobStatusRunning
		if job.Stop {
			status = structs.JobStatusDead
		}
		return status, nil
	}

	allocIter, err := txn.Get("allocs", "job", job.Namespace, job.ID)
	if err != nil {
		return "", err
	}

	// A single live allocation is enough to call the job running.
	sawAlloc := false
	for {
		raw := allocIter.Next()
		if raw == nil {
			break
		}
		sawAlloc = true
		if alloc := raw.(*structs.Allocation); !alloc.TerminalStatus() {
			return structs.JobStatusRunning, nil
		}
	}

	evalIter, err := txn.Get("evals", "job_prefix", job.Namespace, job.ID)
	if err != nil {
		return "", err
	}

	sawEval := false
	for {
		raw := evalIter.Next()
		if raw == nil {
			break
		}

		// The lookup is by prefix, so drop evaluations of other jobs.
		eval := raw.(*structs.Evaluation)
		if eval.JobID != job.ID {
			continue
		}

		sawEval = true
		if !eval.TerminalStatus() {
			return structs.JobStatusPending, nil
		}
	}

	// With no allocations, no evaluations and no evaluation deletion in
	// progress, the job has not been processed yet.
	if !evalDelete && !sawEval && !sawAlloc {
		return structs.JobStatusPending, nil
	}

	// Everything that exists is terminal, or the evaluations were removed.
	return structs.JobStatusDead, nil
}

// updateSummaryWithJob makes sure the job has a summary containing an entry
// for each of its task groups. A missing summary is created with index as its
// create index, and missing task group entries are added as zero values. The
// summary and the job_summary index are only written when something changed.
func (s *StateStore) updateSummaryWithJob(index uint64, job *structs.Job,
	txn *txn) error {

	existing, err := txn.First("job_summary", "id", job.Namespace, job.ID)
	if err != nil {
		return fmt.Errorf("job summary lookup failed: %v", err)
	}

	// Start from a copy of the stored summary, or from a fresh one.
	var (
		summary *structs.JobSummary
		changed bool
	)
	if existing == nil {
		summary = &structs.JobSummary{
			JobID:       job.ID,
			Namespace:   job.Namespace,
			Summary:     make(map[string]structs.TaskGroupSummary),
			Children:    new(structs.JobChildrenSummary),
			CreateIndex: index,
		}
		changed = true
	} else {
		summary = existing.(*structs.JobSummary).Copy()
	}

	// Add an empty entry for every task group that is not tracked yet.
	for _, tg := range job.TaskGroups {
		if _, tracked := summary.Summary[tg.Name]; tracked {
			continue
		}
		summary.Summary[tg.Name] = structs.TaskGroupSummary{}
		changed = true
	}

	if !changed {
		return nil
	}

	// The summary changed, so record the new modify index and persist it.
	summary.ModifyIndex = index

	if err := txn.Insert("index", &IndexEntry{Key: "job_summary", Value: index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}
	return txn.Insert("job_summary", summary)
}

// updateJobScalingPolicies brings the stored scaling policies of a job in line
// with the job itself: stored policies whose job key is no longer present in
// the job are deleted, then all of the job's current policies are upserted.
func (s *StateStore) updateJobScalingPolicies(index uint64, job *structs.Job, txn *txn) error {

	policies := job.GetScalingPolicies()

	// wanted is the set of job keys the job currently declares.
	wanted := make(map[string]struct{}, len(policies))
	for _, policy := range policies {
		wanted[policy.JobKey()] = struct{}{}
	}

	stored, err := s.ScalingPoliciesByJobTxn(memdb.NewWatchSet(), job.Namespace, job.ID, txn)
	if err != nil {
		return fmt.Errorf("ScalingPoliciesByJob lookup failed: %v", err)
	}

	// Collect the IDs of stored policies the job no longer declares.
	stale := make([]string, 0)
	for {
		raw := stored.Next()
		if raw == nil {
			break
		}
		old := raw.(*structs.ScalingPolicy)
		if _, keep := wanted[old.JobKey()]; !keep {
			stale = append(stale, old.ID)
		}
	}

	if err := s.DeleteScalingPoliciesTxn(index, stale, txn); err != nil {
		return fmt.Errorf("DeleteScalingPolicies of removed policies failed: %v", err)
	}

	if err := s.UpsertScalingPoliciesTxn(index, policies, txn); err != nil {
		return fmt.Errorf("UpsertScalingPolicies of policies failed: %v", err)
	}

	return nil
}

// updateJobSubmission stores the original job source and variables that the
// job structure originates from. Providing the source material is up to the
// job submitter, so sub may be nil, in which case nothing is stored. Otherwise
// sub is stamped with the namespace, job ID, index and version, inserted, and
// older submissions of the same job are pruned.
func (s *StateStore) updateJobSubmission(index uint64, sub *structs.JobSubmission, namespace, jobID string, version uint64, txn *txn) error {
	// No submission means there is nothing to record.
	if sub == nil {
		return nil
	}
	if namespace == "" {
		return errors.New("job_submission requires a namespace")
	}
	if jobID == "" {
		return errors.New("job_submission requires a jobID")
	}

	// Tie the submission to the job version it belongs to.
	sub.Namespace = namespace
	sub.JobID = jobID
	sub.JobModifyIndex = index
	sub.Version = version

	if err := txn.Insert("job_submission", sub); err != nil {
		return err
	}

	// Drop submissions beyond the retention limit.
	return s.pruneJobSubmissions(namespace, jobID, txn)
}

// pruneJobSubmissions deletes the stored submissions of a job beyond the
// structs.JobTrackedVersions with the highest job modify index. It does
// nothing while the number of stored submissions is within that limit.
func (s *StateStore) pruneJobSubmissions(namespace, jobID string, txn *txn) error {
	// The limit matches the number of tracked job versions, but submissions do
	// not map 1:1 onto versions: some versions may have no submission, or
	// there may be none at all.
	limit := structs.JobTrackedVersions

	submissions, err := txn.Get("job_submission", "by_jobID", namespace, jobID)
	if err != nil {
		return err
	}

	// Record (job modify index, version) for every stored submission.
	stored := make([]lang.Pair[uint64, uint64], 0, limit+1)
	for {
		raw := submissions.Next()
		if raw == nil {
			break
		}
		sub := raw.(*structs.JobSubmission)
		stored = append(stored, lang.Pair[uint64, uint64]{
			First:  sub.JobModifyIndex,
			Second: sub.Version,
		})
	}

	// Still within the limit, nothing to prune.
	if len(stored) <= limit {
		return nil
	}

	// Order by job modify index, highest first, so that everything past the
	// limit is outdated.
	slices.SortFunc(stored, func(a, b lang.Pair[uint64, uint64]) bool {
		return b.First < a.First
	})

	for _, outdated := range stored[limit:] {
		stale := &structs.JobSubmission{
			Namespace: namespace,
			JobID:     jobID,
			Version:   outdated.Second,
		}
		if err := txn.Delete("job_submission", stale); err != nil {
			return err
		}
	}
	return nil
}

// updateJobCSIPlugins runs on job update and indexes the job in every CSI
// plugin its tasks configure. When prev is non-nil it is first removed from
// the plugins it referenced, then job is added to the plugins it references.
// Every plugin touched this way is written back, and the csi_plugins index is
// set to index.
func (s *StateStore) updateJobCSIPlugins(index uint64, job, prev *structs.Job, txn *txn) error {
	// touched holds the working copy of every plugin modified by this call so
	// that removals and additions accumulate on the same object.
	touched := make(map[string]*structs.CSIPlugin)

	// apply visits every task of j that has a CSI plugin config and either
	// removes j from (remove == true) or adds j to the matching plugin.
	apply := func(j *structs.Job, remove bool) error {
		for _, tg := range j.TaskGroups {
			for _, task := range tg.Tasks {
				cfg := task.CSIPluginConfig
				if cfg == nil {
					continue
				}

				plugin, cached := touched[cfg.ID]
				if !cached {
					stored, err := s.CSIPluginByIDTxn(txn, nil, cfg.ID)
					if err != nil {
						return err
					}

					// Copy the stored plugin, or start a new one if the
					// plugin is not known yet.
					if stored != nil {
						plugin = stored.Copy()
						plugin.ModifyIndex = index
					} else {
						plugin = structs.NewCSIPlugin(cfg.ID, index)
					}
					touched[plugin.ID] = plugin
				}

				if remove {
					plugin.DeleteJob(j, nil)
				} else {
					plugin.AddJob(j, nil)
				}
			}
		}

		return nil
	}

	// Unregister the previous version of the job first.
	if prev != nil {
		if err := apply(prev, true); err != nil {
			return err
		}
	}

	if err := apply(job, false); err != nil {
		return err
	}

	for _, plugin := range touched {
		if err := txn.Insert("csi_plugins", plugin); err != nil {
			return fmt.Errorf("csi_plugins insert error: %v", err)
		}
	}

	if err := txn.Insert("index", &IndexEntry{Key: "csi_plugins", Value: index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return nil
}

// updateDeploymentWithAlloc is used to update the deployment state associated
// with the given allocation. The passed alloc may be updated if the deployment
// status has changed to capture the modify index at which it has changed.
//
// alloc is the new version of the allocation and existing is the previously
// stored version, or nil if there is none. The function is a no-op when the
// allocation has no deployment, the deployment cannot be found, the
// allocation's task group is not part of the deployment, or none of the
// placed/healthy/unhealthy counters would change.
func (s *StateStore) updateDeploymentWithAlloc(index uint64, alloc, existing *structs.Allocation, txn *txn) error {
	// Nothing to do if the allocation is not associated with a deployment
	if alloc.DeploymentID == "" {
		return nil
	}

	// Get the deployment
	ws := memdb.NewWatchSet()
	deployment, err := s.deploymentByIDImpl(ws, alloc.DeploymentID, txn)
	if err != nil {
		return err
	}
	if deployment == nil {
		return nil
	}

	// Retrieve the deployment state object
	_, ok := deployment.TaskGroups[alloc.TaskGroup]
	if !ok {
		// If the task group isn't part of the deployment, the task group wasn't
		// part of a rolling update so nothing to do
		return nil
	}

	// Do not modify in-place. Instead keep track of what must be done. These
	// are deltas applied to the task group's counters further down.
	placed := 0
	healthy := 0
	unhealthy := 0

	// If there was no existing allocation, this is a placement and we increment
	// the placement. An existing allocation that belonged to a different
	// deployment is counted as a placement as well.
	existingHealthSet := existing != nil && existing.DeploymentStatus.HasHealth()
	allocHealthSet := alloc.DeploymentStatus.HasHealth()
	if existing == nil || existing.DeploymentID != alloc.DeploymentID {
		placed++
	} else if !existingHealthSet && allocHealthSet {
		// Health has been reported for the first time
		if *alloc.DeploymentStatus.Healthy {
			healthy++
		} else {
			unhealthy++
		}
	} else if existingHealthSet && allocHealthSet {
		// See if it has gone from healthy to unhealthy. No other transition
		// between two set health values changes the counters.
		if *existing.DeploymentStatus.Healthy && !*alloc.DeploymentStatus.Healthy {
			healthy--
			unhealthy++
		}
	}

	// Nothing to do
	if placed == 0 && healthy == 0 && unhealthy == 0 {
		return nil
	}

	// Update the allocation's deployment status modify index. Note that the
	// healthy-to-unhealthy transition yields healthy+unhealthy == 0 and so does
	// not take this branch.
	if alloc.DeploymentStatus != nil && healthy+unhealthy != 0 {
		alloc.DeploymentStatus.ModifyIndex = index
	}

	// Create a copy of the deployment object
	deploymentCopy := deployment.Copy()
	deploymentCopy.ModifyIndex = index

	dstate := deploymentCopy.TaskGroups[alloc.TaskGroup]
	dstate.PlacedAllocs += placed
	dstate.HealthyAllocs += healthy
	dstate.UnhealthyAllocs += unhealthy

	// Ensure PlacedCanaries accurately reflects the alloc canary status
	if alloc.DeploymentStatus != nil && alloc.DeploymentStatus.Canary {
		found := false
		for _, canary := range dstate.PlacedCanaries {
			if alloc.ID == canary {
				found = true
				break
			}
		}
		if !found {
			dstate.PlacedCanaries = append(dstate.PlacedCanaries, alloc.ID)
		}
	}

	// Update the progress deadline
	if pd := dstate.ProgressDeadline; pd != 0 {
		// If we are the first placed allocation for the deployment start the progress deadline.
		if placed != 0 && dstate.RequireProgressBy.IsZero() {
			// Use modify time instead of create time because we may in-place
			// update the allocation to be part of a new deployment.
			dstate.RequireProgressBy = time.Unix(0, alloc.ModifyTime).Add(pd)
		} else if healthy != 0 {
			// healthy is only non-zero when the alloc has its health set, so
			// DeploymentStatus is non-nil here. The deadline only ever moves
			// forward.
			if d := alloc.DeploymentStatus.Timestamp.Add(pd); d.After(dstate.RequireProgressBy) {
				dstate.RequireProgressBy = d
			}
		}
	}

	// Upsert the deployment
	if err := s.upsertDeploymentImpl(index, deploymentCopy, txn); err != nil {
		return err
	}

	return nil
}

// updateSummaryWithAlloc updates the job summary when allocations are updated
// or inserted.
//
// existingAlloc is the previously stored version of alloc, or nil when alloc
// is being inserted for the first time. The summary (together with the
// "job_summary" index entry and the CSI plugin bookkeeping done by
// updatePluginWithJobSummary) is only written when a counter changed. Any
// failure while writing, including a plugin update failure, is returned to the
// caller.
func (s *StateStore) updateSummaryWithAlloc(index uint64, alloc *structs.Allocation,
	existingAlloc *structs.Allocation, txn *txn) error {

	// We don't have to update the summary if the job is missing
	if alloc.Job == nil {
		return nil
	}

	summaryRaw, err := txn.First("job_summary", "id", alloc.Namespace, alloc.JobID)
	if err != nil {
		return fmt.Errorf("unable to lookup job summary for job id %q in namespace %q: %v", alloc.JobID, alloc.Namespace, err)
	}

	if summaryRaw == nil {
		// Check if the job is de-registered
		rawJob, err := txn.First("jobs", "id", alloc.Namespace, alloc.JobID)
		if err != nil {
			return fmt.Errorf("unable to query job: %v", err)
		}

		// If the job is de-registered then we skip updating its summary
		if rawJob == nil {
			return nil
		}

		return fmt.Errorf("job summary for job %q in namespace %q is not present", alloc.JobID, alloc.Namespace)
	}

	// Get a copy of the existing summary
	jobSummary := summaryRaw.(*structs.JobSummary).Copy()

	// Not updating the job summary because the allocation doesn't belong to the
	// currently registered job
	if jobSummary.CreateIndex != alloc.Job.CreateIndex {
		return nil
	}

	tgSummary, ok := jobSummary.Summary[alloc.TaskGroup]
	if !ok {
		return fmt.Errorf("unable to find task group in the job summary: %v", alloc.TaskGroup)
	}

	summaryChanged := false
	if existingAlloc == nil {
		// Brand new allocation: only a pending client status is expected;
		// anything else is logged but does not fail the update.
		switch alloc.DesiredStatus {
		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
			s.logger.Error("new allocation inserted into state store with bad desired status",
				"alloc_id", alloc.ID, "desired_status", alloc.DesiredStatus)
		}
		switch alloc.ClientStatus {
		case structs.AllocClientStatusPending:
			tgSummary.Starting += 1
			if tgSummary.Queued > 0 {
				tgSummary.Queued -= 1
			}
			summaryChanged = true
		case structs.AllocClientStatusRunning, structs.AllocClientStatusFailed,
			structs.AllocClientStatusComplete:
			s.logger.Error("new allocation inserted into state store with bad client status",
				"alloc_id", alloc.ID, "client_status", alloc.ClientStatus)
		}
	} else if existingAlloc.ClientStatus != alloc.ClientStatus {
		// Incrementing the count of the bin of the current state
		switch alloc.ClientStatus {
		case structs.AllocClientStatusRunning:
			tgSummary.Running += 1
		case structs.AllocClientStatusFailed:
			tgSummary.Failed += 1
		case structs.AllocClientStatusPending:
			tgSummary.Starting += 1
		case structs.AllocClientStatusComplete:
			tgSummary.Complete += 1
		case structs.AllocClientStatusLost:
			tgSummary.Lost += 1
		case structs.AllocClientStatusUnknown:
			tgSummary.Unknown += 1
		}

		// Decrementing the count of the bin of the last state. Counters are
		// never taken below zero.
		switch existingAlloc.ClientStatus {
		case structs.AllocClientStatusRunning:
			if tgSummary.Running > 0 {
				tgSummary.Running -= 1
			}
		case structs.AllocClientStatusPending:
			if tgSummary.Starting > 0 {
				tgSummary.Starting -= 1
			}
		case structs.AllocClientStatusLost:
			if tgSummary.Lost > 0 {
				tgSummary.Lost -= 1
			}
		case structs.AllocClientStatusUnknown:
			if tgSummary.Unknown > 0 {
				tgSummary.Unknown -= 1
			}
		case structs.AllocClientStatusFailed, structs.AllocClientStatusComplete:
			// Failed and complete counters are not decremented.
		default:
			s.logger.Error("invalid old client status for allocation",
				"alloc_id", existingAlloc.ID, "client_status", existingAlloc.ClientStatus)
		}
		summaryChanged = true
	}
	jobSummary.Summary[alloc.TaskGroup] = tgSummary

	if summaryChanged {
		jobSummary.ModifyIndex = index

		// Surface plugin update failures instead of silently writing a
		// summary whose plugin bookkeeping was skipped.
		if err := s.updatePluginWithJobSummary(index, jobSummary, alloc, txn); err != nil {
			return fmt.Errorf("updating plugin with job summary failed: %w", err)
		}

		// Update the indexes table for job summary
		if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
			return fmt.Errorf("index update failed: %v", err)
		}

		if err := txn.Insert("job_summary", jobSummary); err != nil {
			return fmt.Errorf("updating job summary failed: %v", err)
		}
	}

	return nil
}

// updatePluginForTerminalAlloc updates the CSI plugins for an alloc when the
// allocation is updated or inserted with a terminal server status.
//
// For every task in the alloc's task group that has a CSI plugin config, the
// alloc is removed from a copy of the stored plugin and the copy is handed to
// updateOrGCPlugin. Allocs that are not server-terminal, that carry no job, or
// whose task group cannot be found in the job are ignored. A plugin that is
// not in the state store is skipped without affecting the other tasks.
func (s *StateStore) updatePluginForTerminalAlloc(index uint64, alloc *structs.Allocation,
	txn *txn) error {

	if !alloc.ServerTerminalStatus() {
		return nil
	}

	// Without a job there is no task group to inspect.
	if alloc.Job == nil {
		return nil
	}

	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if tg == nil {
		return nil
	}

	for _, t := range tg.Tasks {
		if t.CSIPluginConfig == nil {
			continue
		}

		pluginID := t.CSIPluginConfig.ID
		plug, err := s.CSIPluginByIDTxn(txn, nil, pluginID)
		if err != nil {
			return err
		}
		if plug == nil {
			// plugin may not have been created because it never became
			// healthy; move on to the next task so that other plugins in
			// the same group are still cleaned up
			continue
		}

		// Work on a copy so the stored object is never mutated in place.
		plug = plug.Copy()
		if err := plug.DeleteAlloc(alloc.ID, alloc.NodeID); err != nil {
			return err
		}
		if err := updateOrGCPlugin(index, txn, plug); err != nil {
			return err
		}
	}

	return nil
}

// updatePluginWithJobSummary refreshes the CSI plugins referenced by the
// alloc's task group after the job summary has been changed by that alloc.
//
// Each task with a CSI plugin config gets its plugin looked up (or created at
// index when absent), updated from the job and summary, and then passed to
// updateOrGCPlugin. A task group that cannot be found in the job is a no-op.
func (s *StateStore) updatePluginWithJobSummary(index uint64, summary *structs.JobSummary, alloc *structs.Allocation,
	txn *txn) error {

	group := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if group == nil {
		return nil
	}

	for _, task := range group.Tasks {
		// Only tasks that declare a CSI plugin are of interest.
		if task.CSIPluginConfig == nil {
			continue
		}

		id := task.CSIPluginConfig.ID
		plugin, err := s.CSIPluginByIDTxn(txn, nil, id)
		if err != nil {
			return err
		}

		// Never mutate the stored plugin: copy it, or start a fresh one.
		if plugin != nil {
			plugin = plugin.Copy()
		} else {
			plugin = structs.NewCSIPlugin(id, index)
		}

		jobIsDead := alloc.Job.Status == structs.JobStatusDead
		plugin.UpdateExpectedWithJob(alloc.Job, summary, jobIsDead)

		if err := updateOrGCPlugin(index, txn, plugin); err != nil {
			return err
		}
	}

	return nil
}

// UpsertACLPolicies creates or updates the given ACL policies in one write
// transaction. An existing policy (matched by name) keeps its CreateIndex;
// every upserted policy gets ModifyIndex set to index.
func (s *StateStore) UpsertACLPolicies(msgType structs.MessageType, index uint64, policies []*structs.ACLPolicy) error {
	tx := s.db.WriteTxnMsgT(msgType, index)
	defer tx.Abort()

	for _, p := range policies {
		// Callers should already have hashed the policy; computing it here
		// is only a safety net.
		if len(p.Hash) == 0 {
			p.SetHash()
		}

		// Look for a stored policy with the same name.
		prior, err := tx.First("acl_policy", "id", p.Name)
		if err != nil {
			return fmt.Errorf("policy lookup failed: %v", err)
		}

		// Preserve the original create index on update.
		createIndex := index
		if prior != nil {
			createIndex = prior.(*structs.ACLPolicy).CreateIndex
		}
		p.CreateIndex = createIndex
		p.ModifyIndex = index

		// Write the policy.
		if err := tx.Insert("acl_policy", p); err != nil {
			return fmt.Errorf("upserting policy failed: %v", err)
		}
	}

	// Bump the table index entry.
	if err := tx.Insert("index", &IndexEntry{"acl_policy", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return tx.Commit()
}

// DeleteACLPolicies removes the ACL policies with the given names and bumps
// the "acl_policy" index entry, all within one write transaction.
func (s *StateStore) DeleteACLPolicies(msgType structs.MessageType, index uint64, names []string) error {
	tx := s.db.WriteTxnMsgT(msgType, index)
	defer tx.Abort()

	// Remove each named policy; the number of rows removed is not needed.
	for i := range names {
		_, err := tx.DeleteAll("acl_policy", "id", names[i])
		if err != nil {
			return fmt.Errorf("deleting acl policy failed: %v", err)
		}
	}

	entry := &IndexEntry{"acl_policy", index}
	if err := tx.Insert("index", entry); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return tx.Commit()
}

// ACLPolicyByName looks up a single ACL policy by name. It returns nil with
// no error when no such policy is stored. The lookup's watch channel is added
// to ws.
func (s *StateStore) ACLPolicyByName(ws memdb.WatchSet, name string) (*structs.ACLPolicy, error) {
	tx := s.db.ReadTxn()

	ch, raw, err := tx.FirstWatch("acl_policy", "id", name)
	if err != nil {
		return nil, fmt.Errorf("acl policy lookup failed: %v", err)
	}
	ws.Add(ch)

	if raw == nil {
		return nil, nil
	}
	return raw.(*structs.ACLPolicy), nil
}

// ACLPolicyByNamePrefix returns an iterator over the ACL policies whose name
// starts with prefix. The iterator's watch channel is added to ws.
func (s *StateStore) ACLPolicyByNamePrefix(ws memdb.WatchSet, prefix string) (memdb.ResultIterator, error) {
	tx := s.db.ReadTxn()

	matches, err := tx.Get("acl_policy", "id_prefix", prefix)
	if err != nil {
		return nil, fmt.Errorf("acl policy lookup failed: %v", err)
	}

	ws.Add(matches.WatchCh())
	return matches, nil
}

// ACLPolicyByJob returns an iterator over the ACL policies attached to the
// given job, queried through the "job_prefix" index with the namespace and
// job ID. The iterator's watch channel is added to ws.
func (s *StateStore) ACLPolicyByJob(ws memdb.WatchSet, ns, jobID string) (memdb.ResultIterator, error) {
	tx := s.db.ReadTxn()

	attached, err := tx.Get("acl_policy", "job_prefix", ns, jobID)
	if err != nil {
		return nil, fmt.Errorf("acl policy lookup failed: %v", err)
	}

	ws.Add(attached.WatchCh())
	return attached, nil
}

// ACLPolicies returns an iterator over every ACL policy in the table. The
// iterator's watch channel is added to ws.
func (s *StateStore) ACLPolicies(ws memdb.WatchSet) (memdb.ResultIterator, error) {
	tx := s.db.ReadTxn()

	// A Get on the "id" index with no arguments scans the whole table.
	all, err := tx.Get("acl_policy", "id")
	if err != nil {
		return nil, err
	}

	ws.Add(all.WatchCh())
	return all, nil
}

// UpsertACLTokens creates or updates the given ACL tokens in one write
// transaction. When a token with the same accessor ID is already stored, its
// CreateIndex, SecretID and CreateTime are carried over onto the new token.
func (s *StateStore) UpsertACLTokens(msgType structs.MessageType, index uint64, tokens []*structs.ACLToken) error {
	tx := s.db.WriteTxnMsgT(msgType, index)
	defer tx.Abort()

	for _, tok := range tokens {
		// Callers should already have hashed the token; computing it here
		// is only a safety net.
		if len(tok.Hash) == 0 {
			tok.SetHash()
		}

		// Look for a stored token with the same accessor ID.
		prior, err := tx.First("acl_token", "id", tok.AccessorID)
		if err != nil {
			return fmt.Errorf("token lookup failed: %v", err)
		}

		if prior == nil {
			tok.CreateIndex = index
		} else {
			stored := prior.(*structs.ACLToken)
			tok.CreateIndex = stored.CreateIndex

			// The secret and the creation time are immutable.
			tok.SecretID = stored.SecretID
			tok.CreateTime = stored.CreateTime
		}
		tok.ModifyIndex = index

		// Write the token.
		if err := tx.Insert("acl_token", tok); err != nil {
			return fmt.Errorf("upserting token failed: %v", err)
		}
	}

	// Bump the table index entry.
	if err := tx.Insert("index", &IndexEntry{"acl_token", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return tx.Commit()
}

// DeleteACLTokens removes the ACL tokens with the given accessor IDs and
// bumps the "acl_token" index entry, all within one write transaction.
func (s *StateStore) DeleteACLTokens(msgType structs.MessageType, index uint64, ids []string) error {
	tx := s.db.WriteTxnMsgT(msgType, index)
	defer tx.Abort()

	// Remove each token; the number of rows removed is not needed.
	for i := range ids {
		_, err := tx.DeleteAll("acl_token", "id", ids[i])
		if err != nil {
			return fmt.Errorf("deleting acl token failed: %v", err)
		}
	}

	entry := &IndexEntry{"acl_token", index}
	if err := tx.Insert("index", entry); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return tx.Commit()
}

// ACLTokenByAccessorID looks up an ACL token by its accessor ID. An empty id
// is an error; an unknown id yields nil with no error. The lookup's watch
// channel is added to ws.
func (s *StateStore) ACLTokenByAccessorID(ws memdb.WatchSet, id string) (*structs.ACLToken, error) {
	if id == "" {
		return nil, fmt.Errorf("acl token lookup failed: missing accessor id")
	}

	tx := s.db.ReadTxn()

	ch, raw, err := tx.FirstWatch("acl_token", "id", id)
	if err != nil {
		return nil, fmt.Errorf("acl token lookup failed: %v", err)
	}
	ws.Add(ch)

	// Nothing stored under this accessor ID.
	if raw == nil {
		return nil, nil
	}

	// The stored token's role links may be stale; let fixTokenRoleLinks
	// produce the token that is handed back to the caller.
	fixed, err := s.fixTokenRoleLinks(tx, raw.(*structs.ACLToken))
	if err != nil {
		return nil, err
	}
	return fixed, nil
}

// ACLTokenBySecretID looks up an ACL token by its secret ID. An empty
// secretID is an error; an unknown one yields nil with no error. The lookup's
// watch channel is added to ws.
func (s *StateStore) ACLTokenBySecretID(ws memdb.WatchSet, secretID string) (*structs.ACLToken, error) {
	if secretID == "" {
		return nil, fmt.Errorf("acl token lookup failed: missing secret id")
	}

	tx := s.db.ReadTxn()

	ch, raw, err := tx.FirstWatch("acl_token", "secret", secretID)
	if err != nil {
		return nil, fmt.Errorf("acl token lookup failed: %v", err)
	}
	ws.Add(ch)

	// Nothing stored under this secret ID.
	if raw == nil {
		return nil, nil
	}

	// The stored token's role links may be stale; let fixTokenRoleLinks
	// produce the token that is handed back to the caller.
	fixed, err := s.fixTokenRoleLinks(tx, raw.(*structs.ACLToken))
	if err != nil {
		return nil, err
	}
	return fixed, nil
}

// ACLTokenByAccessorIDPrefix returns an iterator over the ACL tokens whose
// accessor ID starts with prefix, in reverse order when sort is SortReverse.
// The iterator's watch channel is added to ws.
func (s *StateStore) ACLTokenByAccessorIDPrefix(ws memdb.WatchSet, prefix string, sort SortOption) (memdb.ResultIterator, error) {
	tx := s.db.ReadTxn()

	var (
		tokens memdb.ResultIterator
		err    error
	)
	if sort == SortReverse {
		tokens, err = tx.GetReverse("acl_token", "id_prefix", prefix)
	} else {
		tokens, err = tx.Get("acl_token", "id_prefix", prefix)
	}
	if err != nil {
		return nil, fmt.Errorf("acl token lookup failed: %v", err)
	}

	ws.Add(tokens.WatchCh())
	return tokens, nil
}

// ACLTokens returns an iterator over all ACL tokens using the "create" index,
// in reverse order when sort is SortReverse. The iterator's watch channel is
// added to ws.
func (s *StateStore) ACLTokens(ws memdb.WatchSet, sort SortOption) (memdb.ResultIterator, error) {
	tx := s.db.ReadTxn()

	var (
		tokens memdb.ResultIterator
		err    error
	)
	if sort == SortReverse {
		tokens, err = tx.GetReverse("acl_token", "create")
	} else {
		tokens, err = tx.Get("acl_token", "create")
	}
	if err != nil {
		return nil, err
	}

	ws.Add(tokens.WatchCh())
	return tokens, nil
}

// ACLTokensByGlobal returns an iterator over the ACL tokens whose global flag
// equals globalVal, in reverse order when sort is SortReverse. The iterator's
// watch channel is added to ws.
func (s *StateStore) ACLTokensByGlobal(ws memdb.WatchSet, globalVal bool, sort SortOption) (memdb.ResultIterator, error) {
	tx := s.db.ReadTxn()

	var (
		tokens memdb.ResultIterator
		err    error
	)

	// Query the "global" index in the requested direction.
	if sort == SortReverse {
		tokens, err = tx.GetReverse("acl_token", "global", globalVal)
	} else {
		tokens, err = tx.Get("acl_token", "global", globalVal)
	}
	if err != nil {
		return nil, err
	}

	ws.Add(tokens.WatchCh())
	return tokens, nil
}

// CanBootstrapACLToken reports whether ACL bootstrapping is still possible.
// When the "acl_token_bootstrap" index entry exists, bootstrapping is not
// possible and that entry's value is returned as the reset index.
func (s *StateStore) CanBootstrapACLToken() (bool, uint64, error) {
	tx := s.db.ReadTxn()

	// The sentinel entry is written by a successful bootstrap.
	sentinel, err := tx.First("index", "id", "acl_token_bootstrap")
	if err != nil {
		return false, 0, err
	}

	if sentinel != nil {
		// Already bootstrapped: hand back the reset index.
		return false, sentinel.(*IndexEntry).Value, nil
	}

	// No sentinel means no bootstrap has happened yet.
	return true, 0, nil
}

// BootstrapACLTokens inserts the initial ACL token. If a bootstrap has
// already happened, resetIndex must equal the recorded bootstrap index,
// otherwise an error is returned. On success the "acl_token" and
// "acl_token_bootstrap" index entries are both set to index.
func (s *StateStore) BootstrapACLTokens(msgType structs.MessageType, index uint64, resetIndex uint64, token *structs.ACLToken) error {
	tx := s.db.WriteTxnMsgT(msgType, index)
	defer tx.Abort()

	// A sentinel entry means a bootstrap was done before.
	sentinel, err := tx.First("index", "id", "acl_token_bootstrap")
	if err != nil {
		return fmt.Errorf("bootstrap check failed: %v", err)
	}
	if sentinel != nil {
		switch {
		case resetIndex == 0:
			return fmt.Errorf("ACL bootstrap already done")
		case resetIndex != sentinel.(*IndexEntry).Value:
			return fmt.Errorf("Invalid reset index for ACL bootstrap")
		}
	}

	// Stamp the token with the current index.
	token.CreateIndex = index
	token.ModifyIndex = index

	if err := tx.Insert("acl_token", token); err != nil {
		return fmt.Errorf("upserting token failed: %v", err)
	}

	// Bump the table index, then record the sentinel that blocks further
	// bootstraps until a reset.
	if err := tx.Insert("index", &IndexEntry{"acl_token", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}
	if err := tx.Insert("index", &IndexEntry{"acl_token_bootstrap", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return tx.Commit()
}

// UpsertOneTimeToken stores a single one-time token. The token must carry a
// non-zero ExpiresAt; checking that it has not already expired is left to the
// caller, to facilitate testing.
func (s *StateStore) UpsertOneTimeToken(msgType structs.MessageType, index uint64, token *structs.OneTimeToken) error {
	tx := s.db.WriteTxnMsgT(msgType, index)
	defer tx.Abort()

	// The RPC layer is expected to have set the expiry.
	if token.ExpiresAt.IsZero() {
		return fmt.Errorf("one-time token must have an ExpiresAt time")
	}

	// Both indexes are always set to the current index.
	token.CreateIndex = index
	token.ModifyIndex = index

	if err := tx.Insert("one_time_token", token); err != nil {
		return fmt.Errorf("upserting one-time token failed: %v", err)
	}

	// Bump the table index entry.
	entry := &IndexEntry{"one_time_token", index}
	if err := tx.Insert("index", entry); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return tx.Commit()
}

// DeleteOneTimeTokens deletes the one-time tokens stored under the given IDs.
// The "one_time_token" index entry is only bumped when at least one token was
// actually removed.
func (s *StateStore) DeleteOneTimeTokens(msgType structs.MessageType, index uint64, ids []string) error {
	tx := s.db.WriteTxnMsgT(msgType, index)
	defer tx.Abort()

	removed := 0
	for i := range ids {
		n, err := tx.DeleteAll("one_time_token", "id", ids[i])
		if err != nil {
			return fmt.Errorf("deleting one-time token failed: %v", err)
		}
		removed += n
	}

	if removed > 0 {
		entry := &IndexEntry{"one_time_token", index}
		if err := tx.Insert("index", entry); err != nil {
			return fmt.Errorf("index update failed: %v", err)
		}
	}

	return tx.Commit()
}

// ExpireOneTimeTokens deletes the one-time tokens selected by
// oneTimeTokensExpiredTxn for the given timestamp. The "one_time_token" index
// entry is only bumped when at least one token was removed.
func (s *StateStore) ExpireOneTimeTokens(msgType structs.MessageType, index uint64, timestamp time.Time) error {
	tx := s.db.WriteTxnMsgT(msgType, index)
	defer tx.Abort()

	expired, err := s.oneTimeTokensExpiredTxn(tx, nil, timestamp)
	if err != nil {
		return err
	}

	removed := 0
	for raw := expired.Next(); raw != nil; raw = expired.Next() {
		ott, ok := raw.(*structs.OneTimeToken)
		if !ok || ott == nil {
			return fmt.Errorf("could not decode one-time token")
		}

		// Tokens are removed via the "secret" index.
		n, err := tx.DeleteAll("one_time_token", "secret", ott.OneTimeSecretID)
		if err != nil {
			return fmt.Errorf("deleting one-time token failed: %v", err)
		}
		removed += n
	}

	if removed > 0 {
		entry := &IndexEntry{"one_time_token", index}
		if err := tx.Insert("index", entry); err != nil {
			return fmt.Errorf("index update failed: %v", err)
		}
	}

	return tx.Commit()
}

// oneTimeTokensExpiredTxn returns an iterator over the one-time tokens that
// have expired as of timestamp. It scans the whole table and wraps the result
// in a filter built by expiredOneTimeTokenFilter. The unfiltered iterator's
// watch channel is added to ws.
func (s *StateStore) oneTimeTokensExpiredTxn(txn *txn, ws memdb.WatchSet, timestamp time.Time) (memdb.ResultIterator, error) {
	all, err := txn.Get("one_time_token", "id")
	if err != nil {
		return nil, fmt.Errorf("one-time token lookup failed: %v", err)
	}
	ws.Add(all.WatchCh())

	keepExpired := expiredOneTimeTokenFilter(timestamp)
	return memdb.NewFilterIterator(all, keepExpired), nil
}

// OneTimeTokenBySecret looks up a one-time token by its secret. An empty
// secret is an error; an unknown one yields nil with no error. The lookup's
// watch channel is added to ws.
func (s *StateStore) OneTimeTokenBySecret(ws memdb.WatchSet, secret string) (*structs.OneTimeToken, error) {
	if secret == "" {
		return nil, fmt.Errorf("one-time token lookup failed: missing secret")
	}

	tx := s.db.ReadTxn()

	ch, raw, err := tx.FirstWatch("one_time_token", "secret", secret)
	if err != nil {
		return nil, fmt.Errorf("one-time token lookup failed: %v", err)
	}
	ws.Add(ch)

	if raw == nil {
		return nil, nil
	}
	return raw.(*structs.OneTimeToken), nil
}

// expiredOneTimeTokenFilter builds a predicate for memdb.NewFilterIterator.
// The predicate returns true for values that are not one-time tokens and for
// tokens whose ExpiresAt is after now, so that only expired tokens remain
// after filtering.
func expiredOneTimeTokenFilter(now time.Time) func(interface{}) bool {
	return func(raw interface{}) bool {
		if ott, ok := raw.(*structs.OneTimeToken); ok {
			// Still valid tokens are filtered out.
			return ott.ExpiresAt.After(now)
		}
		// Anything that is not a one-time token is filtered out as well.
		return true
	}
}

// SchedulerConfig returns the result of schedulerConfigTxn evaluated in a
// fresh read transaction: the stored scheduler configuration and its modify
// index.
func (s *StateStore) SchedulerConfig() (uint64, *structs.SchedulerConfiguration, error) {
	readTxn := s.db.ReadTxn()
	defer readTxn.Abort()

	return s.schedulerConfigTxn(readTxn)
}

// schedulerConfigTxn reads the scheduler configuration within txn. It returns
// the configuration's ModifyIndex and the configuration itself, or
// (0, nil, nil) when no configuration is stored.
func (s *StateStore) schedulerConfigTxn(txn *txn) (uint64, *structs.SchedulerConfiguration, error) {
	raw, err := txn.First("scheduler_config", "id")
	if err != nil {
		return 0, nil, fmt.Errorf("failed scheduler config lookup: %s", err)
	}

	// The assertion fails when nothing is stored (raw is nil).
	if cfg, ok := raw.(*structs.SchedulerConfiguration); ok {
		return cfg.ModifyIndex, cfg, nil
	}
	return 0, nil, nil
}

// SchedulerSetConfig is used to set the current Scheduler configuration.
//
// The configuration is written through schedulerSetConfigTxn inside a write
// transaction at index. If that write fails, the error is returned and the
// transaction is aborted rather than committed.
func (s *StateStore) SchedulerSetConfig(index uint64, config *structs.SchedulerConfiguration) error {
	tx := s.db.WriteTxn(index)
	defer tx.Abort()

	// Do not commit (and report success) when the write itself failed.
	if err := s.schedulerSetConfigTxn(index, tx, config); err != nil {
		return err
	}

	return tx.Commit()
}

// ClusterMetadata returns the stored cluster metadata, or nil with no error
// when none has been stored. The lookup's watch channel is added to ws.
func (s *StateStore) ClusterMetadata(ws memdb.WatchSet) (*structs.ClusterMetadata, error) {
	tx := s.db.ReadTxn()
	defer tx.Abort()

	ch, raw, err := tx.FirstWatch("cluster_meta", "id")
	if err != nil {
		return nil, fmt.Errorf("failed cluster metadata lookup: %w", err)
	}
	ws.Add(ch)

	if raw == nil {
		return nil, nil
	}
	return raw.(*structs.ClusterMetadata), nil
}

// ClusterSetMetadata stores meta through setClusterMetadata inside a write
// transaction at index and commits it.
func (s *StateStore) ClusterSetMetadata(index uint64, meta *structs.ClusterMetadata) error {
	tx := s.db.WriteTxn(index)
	defer tx.Abort()

	err := s.setClusterMetadata(tx, meta)
	if err != nil {
		return fmt.Errorf("set cluster metadata failed: %w", err)
	}

	return tx.Commit()
}

// WithWriteTransaction runs fn inside a write transaction. When fn returns an
// error, that error is returned and the transaction is aborted; otherwise the
// result of committing the transaction is returned.
func (s *StateStore) WithWriteTransaction(msgType structs.MessageType, index uint64, fn func(Txn) error) error {
	writeTxn := s.db.WriteTxnMsgT(msgType, index)
	defer writeTxn.Abort()

	if err := fn(writeTxn); err != nil {
		return err
	}
	return writeTxn.Commit()
}

// SchedulerCASConfig is used to update the scheduler configuration with a
// given Raft index. If the CAS index specified is not equal to the last observed index
// for the config, then the call is a noop.
//
// It returns true only when the configuration was written and committed. It
// returns (false, nil) when no configuration is stored yet or when the stored
// ModifyIndex differs from cidx, and (false, err) when the lookup, the write,
// or the commit fails.
func (s *StateStore) SchedulerCASConfig(index, cidx uint64, config *structs.SchedulerConfiguration) (bool, error) {
	tx := s.db.WriteTxn(index)
	defer tx.Abort()

	// Check for an existing config
	existing, err := tx.First("scheduler_config", "id")
	if err != nil {
		return false, fmt.Errorf("failed scheduler config lookup: %s", err)
	}

	// If the existing index does not match the provided CAS
	// index arg, then we shouldn't update anything and can safely
	// return early here. The type assertion also fails, and we return early,
	// when nothing is stored.
	e, ok := existing.(*structs.SchedulerConfiguration)
	if !ok || (e != nil && e.ModifyIndex != cidx) {
		return false, nil
	}

	// Do not commit (and report success) when the write itself failed.
	if err := s.schedulerSetConfigTxn(index, tx, config); err != nil {
		return false, err
	}

	if err := tx.Commit(); err != nil {
		return false, err
	}
	return true, nil
}

// schedulerSetConfigTxn writes config into the "scheduler_config" table within
// tx. The CreateIndex of an already stored configuration is preserved;
// otherwise it is set to idx. ModifyIndex is always set to idx.
func (s *StateStore) schedulerSetConfigTxn(idx uint64, tx *txn, config *structs.SchedulerConfiguration) error {
	prior, err := tx.First("scheduler_config", "id")
	if err != nil {
		return fmt.Errorf("failed scheduler config lookup: %s", err)
	}

	// Work out the create index before touching config.
	createIndex := idx
	if prior != nil {
		createIndex = prior.(*structs.SchedulerConfiguration).CreateIndex
	}
	config.CreateIndex = createIndex
	config.ModifyIndex = idx

	err = tx.Insert("scheduler_config", config)
	if err != nil {
		return fmt.Errorf("failed updating scheduler config: %s", err)
	}
	return nil
}

// setClusterMetadata inserts meta into the "cluster_meta" table within txn.
// It refuses to replace a stored, non-empty cluster ID with a different one.
func (s *StateStore) setClusterMetadata(txn *txn, meta *structs.ClusterMetadata) error {
	stored, err := txn.First("cluster_meta", "id")
	if err != nil {
		return fmt.Errorf("failed cluster meta lookup: %v", err)
	}

	if stored != nil {
		previousID := stored.(*structs.ClusterMetadata).ClusterID

		// A differing ID here points at a bug in cluster ID detection.
		if previousID != "" && meta.ClusterID != previousID {
			return fmt.Errorf("refusing to set new cluster id, previous: %s, new: %s", previousID, meta.ClusterID)
		}
	}

	// With only immutable fields today this is effectively a noop on update.
	err = txn.Insert("cluster_meta", meta)
	if err != nil {
		return fmt.Errorf("set cluster metadata failed: %v", err)
	}
	return nil
}

// UpsertScalingPolicies upserts the given scaling policies through
// UpsertScalingPoliciesTxn inside a write transaction at index and commits it.
func (s *StateStore) UpsertScalingPolicies(index uint64, scalingPolicies []*structs.ScalingPolicy) error {
	tx := s.db.WriteTxn(index)
	defer tx.Abort()

	err := s.UpsertScalingPoliciesTxn(index, scalingPolicies, tx)
	if err != nil {
		return err
	}
	return tx.Commit()
}

// UpsertScalingPoliciesTxn upserts the given scaling policies within txn.
//
// A stored policy is matched by target (namespace, job, group, task) and
// type. A matching policy for which Diff reports no difference is skipped;
// otherwise the incoming policy takes over its ID and CreateIndex. The
// "scaling_policy" index entry is only bumped when something was written.
func (s *StateStore) UpsertScalingPoliciesTxn(index uint64, scalingPolicies []*structs.ScalingPolicy,
	txn *txn) error {

	wrote := false

	for _, policy := range scalingPolicies {
		// Candidates share the same target; the type is compared below.
		candidates, err := txn.Get("scaling_policy", "target",
			policy.Target[structs.ScalingTargetNamespace],
			policy.Target[structs.ScalingTargetJob],
			policy.Target[structs.ScalingTargetGroup],
			policy.Target[structs.ScalingTargetTask],
		)
		if err != nil {
			return fmt.Errorf("scaling policy lookup failed: %v", err)
		}

		// Pick the first candidate of the same type, if any.
		var stored *structs.ScalingPolicy
		for {
			raw := candidates.Next()
			if raw == nil {
				break
			}
			if c := raw.(*structs.ScalingPolicy); c.Type == policy.Type {
				stored = c
				break
			}
		}

		if stored == nil {
			// policy.ID must have been set already in Job.Register before log apply
			policy.CreateIndex = index
		} else {
			// Nothing to write when the stored policy is equivalent.
			if !stored.Diff(policy) {
				continue
			}
			policy.ID = stored.ID
			policy.CreateIndex = stored.CreateIndex
		}
		policy.ModifyIndex = index

		wrote = true
		if err := txn.Insert("scaling_policy", policy); err != nil {
			return err
		}
	}

	// Only touch the table index when at least one policy was written.
	if !wrote {
		return nil
	}
	if err := txn.Insert("index", &IndexEntry{"scaling_policy", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}
	return nil
}

// NamespaceByName looks up a namespace by name using a fresh read
// transaction; see namespaceByNameImpl for the lookup itself.
func (s *StateStore) NamespaceByName(ws memdb.WatchSet, name string) (*structs.Namespace, error) {
	return s.namespaceByNameImpl(ws, s.db.ReadTxn(), name)
}

// namespaceByNameImpl looks up a namespace by name within txn. It returns nil
// with no error when the namespace is not stored. The lookup's watch channel
// is added to ws.
func (s *StateStore) namespaceByNameImpl(ws memdb.WatchSet, txn *txn, name string) (*structs.Namespace, error) {
	ch, raw, err := txn.FirstWatch(TableNamespaces, "id", name)
	if err != nil {
		return nil, fmt.Errorf("namespace lookup failed: %v", err)
	}
	ws.Add(ch)

	if raw == nil {
		return nil, nil
	}
	return raw.(*structs.Namespace), nil
}

// namespaceExists reports whether the namespace exists. The default namespace
// is always reported as existing without consulting the table.
func (s *StateStore) namespaceExists(txn *txn, namespace string) (bool, error) {
	if namespace == structs.DefaultNamespace {
		return true, nil
	}

	raw, err := txn.First(TableNamespaces, "id", namespace)
	if err != nil {
		return false, fmt.Errorf("namespace lookup failed: %v", err)
	}

	found := raw != nil
	return found, nil
}

// NamespacesByNamePrefix returns an iterator over the namespaces whose name
// starts with namePrefix. The iterator's watch channel is added to ws.
func (s *StateStore) NamespacesByNamePrefix(ws memdb.WatchSet, namePrefix string) (memdb.ResultIterator, error) {
	tx := s.db.ReadTxn()

	matches, err := tx.Get(TableNamespaces, "id_prefix", namePrefix)
	if err != nil {
		return nil, fmt.Errorf("namespaces lookup failed: %v", err)
	}

	ws.Add(matches.WatchCh())
	return matches, nil
}

// Namespaces returns an iterator over every namespace in the table. The
// iterator's watch channel is added to ws.
func (s *StateStore) Namespaces(ws memdb.WatchSet) (memdb.ResultIterator, error) {
	tx := s.db.ReadTxn()

	// A Get on the "id" index with no arguments scans the whole table.
	all, err := tx.Get(TableNamespaces, "id")
	if err != nil {
		return nil, err
	}

	ws.Add(all.WatchCh())
	return all, nil
}

// NamespaceNames returns the names of all stored namespaces, in the order
// produced by Namespaces. The result is an empty, non-nil slice when there
// are none.
func (s *StateStore) NamespaceNames() ([]string, error) {
	all, err := s.Namespaces(nil)
	if err != nil {
		return nil, err
	}

	names := make([]string, 0)
	for raw := all.Next(); raw != nil; raw = all.Next() {
		names = append(names, raw.(*structs.Namespace).Name)
	}
	return names, nil
}

// UpsertNamespaces registers or updates the given namespaces in one write
// transaction and bumps the namespaces index entry.
func (s *StateStore) UpsertNamespaces(index uint64, namespaces []*structs.Namespace) error {
	tx := s.db.WriteTxn(index)
	defer tx.Abort()

	for i := range namespaces {
		err := s.upsertNamespaceImpl(index, tx, namespaces[i])
		if err != nil {
			return err
		}
	}

	entry := &IndexEntry{TableNamespaces, index}
	if err := tx.Insert("index", entry); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return tx.Commit()
}

// upsertNamespaceImpl upserts a single namespace within txn. A stored
// namespace with the same name keeps its CreateIndex. A non-empty quota must
// refer to an existing quota spec. After the insert, the new and the previous
// quota are passed to quotaReconcile.
func (s *StateStore) upsertNamespaceImpl(index uint64, txn *txn, namespace *structs.Namespace) error {
	// Callers should already have hashed the namespace; computing it here
	// is only a safety net.
	if len(namespace.Hash) == 0 {
		namespace.SetHash()
	}

	// Look for a stored namespace with the same name.
	raw, err := txn.First(TableNamespaces, "id", namespace.Name)
	if err != nil {
		return fmt.Errorf("namespace lookup failed: %v", err)
	}

	// Derive the create index and remember the previous quota so it can be
	// reconciled below.
	previousQuota := ""
	createIndex := index
	if raw != nil {
		stored := raw.(*structs.Namespace)
		createIndex = stored.CreateIndex
		previousQuota = stored.Quota
	}
	namespace.CreateIndex = createIndex
	namespace.ModifyIndex = index

	// The referenced quota, if any, has to exist.
	if namespace.Quota != "" {
		found, err := s.quotaSpecExists(txn, namespace.Quota)
		if err != nil {
			return fmt.Errorf("looking up namespace quota %q failed: %v", namespace.Quota, err)
		}
		if !found {
			return fmt.Errorf("namespace %q using non-existent quota %q", namespace.Name, namespace.Quota)
		}
	}

	// Write the namespace.
	if err := txn.Insert(TableNamespaces, namespace); err != nil {
		return fmt.Errorf("namespace insert failed: %v", err)
	}

	// Reconcile the new and the previous quota.
	return s.quotaReconcile(index, txn, namespace.Quota, previousQuota)
}

// DeleteNamespaces removes the named namespaces in one write transaction.
//
// The call fails, and nothing is committed, when any name is unknown, refers
// to the default namespace, or the namespace still contains a job whose
// status is not dead, a CSI volume, or a variable.
func (s *StateStore) DeleteNamespaces(index uint64, names []string) error {
	tx := s.db.WriteTxn(index)
	defer tx.Abort()

	for _, name := range names {
		// The namespace has to exist and must not be the default one.
		stored, err := tx.First(TableNamespaces, "id", name)
		if err != nil {
			return fmt.Errorf("namespace lookup failed: %v", err)
		}
		if stored == nil {
			return fmt.Errorf("namespace not found")
		}
		if stored.(*structs.Namespace).Name == structs.DefaultNamespace {
			return fmt.Errorf("default namespace can not be deleted")
		}

		// Every job in the namespace has to be dead.
		jobs, err := s.jobsByNamespaceImpl(nil, name, tx)
		if err != nil {
			return err
		}
		for raw := jobs.Next(); raw != nil; raw = jobs.Next() {
			if job := raw.(*structs.Job); job.Status != structs.JobStatusDead {
				return fmt.Errorf("namespace %q contains at least one non-terminal job %q. "+
					"All jobs must be terminal in namespace before it can be deleted", name, job.ID)
			}
		}

		// No CSI volume may remain.
		volumes, err := s.csiVolumesByNamespaceImpl(tx, nil, name, "")
		if err != nil {
			return err
		}
		if rawVol := volumes.Next(); rawVol != nil {
			return fmt.Errorf("namespace %q contains at least one CSI volume %q. "+
				"All CSI volumes in namespace must be deleted before it can be deleted",
				name, rawVol.(*structs.CSIVolume).ID)
		}

		// No variable may remain.
		variables, err := s.getVariablesByNamespaceImpl(tx, nil, name)
		if err != nil {
			return err
		}
		if variables.Next() != nil {
			// unlike job/volume, don't show the path here because the user may
			// not have List permissions on the vars in this namespace
			return fmt.Errorf("namespace %q contains at least one variable. "+
				"All variables in namespace must be deleted before it can be deleted", name)
		}

		// All checks passed: remove the namespace.
		if err := tx.Delete(TableNamespaces, stored); err != nil {
			return fmt.Errorf("namespace deletion failed: %v", err)
		}
	}

	entry := &IndexEntry{TableNamespaces, index}
	if err := tx.Insert("index", entry); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}

	return tx.Commit()
}

// DeleteScalingPolicies deletes the scaling policies with the given IDs inside
// a write transaction opened at index. The transaction is committed only when
// DeleteScalingPoliciesTxn succeeds; otherwise its error is returned without
// committing.
func (s *StateStore) DeleteScalingPolicies(index uint64, ids []string) error {
	tx := s.db.WriteTxn(index)
	defer tx.Abort()

	if err := s.DeleteScalingPoliciesTxn(index, ids, tx); err != nil {
		return err
	}
	return tx.Commit()
}

// DeleteScalingPoliciesTxn deletes a set of scaling policies by ID using the
// supplied transaction; it never commits, that is left to the caller.
//
// An empty ids slice is a no-op: nothing is deleted and the table's index
// entry is not written. Otherwise every ID must resolve to a stored policy; the
// first ID that cannot be found (or whose lookup/delete fails) aborts the loop
// with an error.
func (s *StateStore) DeleteScalingPoliciesTxn(index uint64, ids []string, txn *txn) error {
	const table = "scaling_policy"

	if len(ids) == 0 {
		return nil
	}

	for i := range ids {
		policy, err := txn.First(table, "id", ids[i])
		switch {
		case err != nil:
			return fmt.Errorf("scaling policy lookup failed: %v", err)
		case policy == nil:
			return fmt.Errorf("scaling policy not found")
		}

		if err = txn.Delete(table, policy); err != nil {
			return fmt.Errorf("scaling policy delete failed: %v", err)
		}
	}

	// Record index against the scaling_policy table in the "index" table.
	if err := txn.Insert("index", &IndexEntry{table, index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}
	return nil
}

// ScalingPolicies returns an iterator over all the scaling policies, obtained
// by walking the "id" index of the scaling_policy table in a read transaction.
// The iterator's watch channel is added to ws.
func (s *StateStore) ScalingPolicies(ws memdb.WatchSet) (memdb.ResultIterator, error) {
	policies, err := s.db.ReadTxn().Get("scaling_policy", "id")
	if err != nil {
		return nil, err
	}

	ws.Add(policies.WatchCh())
	return policies, nil
}

// ScalingPoliciesByTypePrefix returns an iterator over the scaling policies
// found by querying the "type_prefix" index with t. The iterator's watch
// channel is added to ws.
func (s *StateStore) ScalingPoliciesByTypePrefix(ws memdb.WatchSet, t string) (memdb.ResultIterator, error) {
	byType, err := s.db.ReadTxn().Get("scaling_policy", "type_prefix", t)
	if err != nil {
		return nil, err
	}

	ws.Add(byType.WatchCh())
	return byType, nil
}

// ScalingPoliciesByNamespace returns an iterator over the scaling policies
// that target exactly the given namespace. When typ is non-empty the result is
// narrowed further to policies whose Type starts with typ. The watch channel of
// the underlying index query is added to ws.
func (s *StateStore) ScalingPoliciesByNamespace(ws memdb.WatchSet, namespace, typ string) (memdb.ResultIterator, error) {
	readTxn := s.db.ReadTxn()

	prefixed, err := readTxn.Get("scaling_policy", "target_prefix", namespace)
	if err != nil {
		return nil, err
	}
	ws.Add(prefixed.WatchCh())

	// The lookup above goes through the target_prefix index, so filter the
	// results down to an exact match on the namespace.
	inNamespace := memdb.NewFilterIterator(prefixed, scalingPolicyNamespaceFilter(namespace))
	if typ == "" {
		return inNamespace, nil
	}

	// A policy type was requested as well: drop every entry that is not a
	// scaling policy or whose Type does not start with typ.
	dropOtherTypes := func(raw interface{}) bool {
		if policy, ok := raw.(*structs.ScalingPolicy); ok {
			return !strings.HasPrefix(policy.Type, typ)
		}
		return true
	}
	return memdb.NewFilterIterator(inNamespace, dropOtherTypes), nil
}

// ScalingPoliciesByJob returns an iterator over the scaling policies that
// target the given job in the given namespace, as produced by
// ScalingPoliciesByJobTxn on a fresh read transaction. When policyType is
// non-empty only policies whose Type equals policyType exactly are kept.
func (s *StateStore) ScalingPoliciesByJob(ws memdb.WatchSet, namespace, jobID, policyType string) (memdb.ResultIterator,
	error) {
	jobPolicies, err := s.ScalingPoliciesByJobTxn(ws, namespace, jobID, s.db.ReadTxn())
	if err != nil {
		return nil, err
	}

	if policyType != "" {
		// Drop anything that is not a scaling policy of the requested type.
		return memdb.NewFilterIterator(jobPolicies, func(raw interface{}) bool {
			policy, ok := raw.(*structs.ScalingPolicy)
			return !ok || policy.Type != policyType
		}), nil
	}
	return jobPolicies, nil
}

// ScalingPoliciesByJobTxn returns an iterator over the scaling policies that
// target the given job, using the supplied transaction. The target_prefix
// index is queried with namespace and jobID, and the results are then filtered
// so that only policies whose job target equals jobID exactly remain. The
// watch channel of the index query is added to ws.
func (s *StateStore) ScalingPoliciesByJobTxn(ws memdb.WatchSet, namespace, jobID string,
	txn *txn) (memdb.ResultIterator, error) {

	candidates, err := txn.Get("scaling_policy", "target_prefix", namespace, jobID)
	if err != nil {
		return nil, err
	}
	ws.Add(candidates.WatchCh())

	// Drop entries that are not scaling policies or that target another job.
	notThisJob := func(raw interface{}) bool {
		policy, ok := raw.(*structs.ScalingPolicy)
		return !ok || policy.Target[structs.ScalingTargetJob] != jobID
	}
	return memdb.NewFilterIterator(candidates, notThisJob), nil
}

// ScalingPolicyByID looks up a single scaling policy by its ID. It returns
// (nil, nil) when no policy with that ID is stored. The lookup's watch channel
// is added to ws whether or not a policy was found.
func (s *StateStore) ScalingPolicyByID(ws memdb.WatchSet, id string) (*structs.ScalingPolicy, error) {
	watchCh, raw, err := s.db.ReadTxn().FirstWatch("scaling_policy", "id", id)
	if err != nil {
		return nil, fmt.Errorf("scaling_policy lookup failed: %v", err)
	}
	ws.Add(watchCh)

	if raw == nil {
		return nil, nil
	}
	return raw.(*structs.ScalingPolicy), nil
}

// ScalingPolicyByTargetAndType returns the policy stored against the given
// target (namespace, job, group and task entries of the target map) whose Type
// equals typ, or nil if there is none. Only the target query is registered on
// the watchset; the policy type is not honored by it.
func (s *StateStore) ScalingPolicyByTargetAndType(ws memdb.WatchSet, target map[string]string, typ string) (*structs.ScalingPolicy,
	error) {
	readTxn := s.db.ReadTxn()

	matches, err := readTxn.Get("scaling_policy", "target",
		target[structs.ScalingTargetNamespace],
		target[structs.ScalingTargetJob],
		target[structs.ScalingTargetGroup],
		target[structs.ScalingTargetTask],
	)
	if err != nil {
		return nil, fmt.Errorf("scaling_policy lookup failed: %v", err)
	}
	ws.Add(matches.WatchCh())

	// Several policies may share a target; return the first one of the
	// requested type, or nil once the iterator is exhausted.
	for {
		raw := matches.Next()
		if raw == nil {
			return nil, nil
		}
		if policy := raw.(*structs.ScalingPolicy); policy.Type == typ {
			return policy, nil
		}
	}
}

// ScalingPoliciesByIDPrefix returns an iterator over the scaling policies
// found by querying the "id_prefix" index with prefix, restricted to policies
// that target the given namespace. The watch channel of the index query is
// added to ws.
func (s *StateStore) ScalingPoliciesByIDPrefix(ws memdb.WatchSet, namespace string, prefix string) (memdb.ResultIterator, error) {
	byPrefix, err := s.db.ReadTxn().Get("scaling_policy", "id_prefix", prefix)
	if err != nil {
		return nil, fmt.Errorf("scaling policy lookup failed: %v", err)
	}
	ws.Add(byPrefix.WatchCh())

	return memdb.NewFilterIterator(byPrefix, scalingPolicyNamespaceFilter(namespace)), nil
}

// scalingPolicyNamespaceFilter builds a filter function that reports true
// (i.e. "filter this out") for every value that is not a scaling policy
// targeting the given namespace.
func scalingPolicyNamespaceFilter(namespace string) func(interface{}) bool {
	return func(raw interface{}) bool {
		if policy, ok := raw.(*structs.ScalingPolicy); ok {
			return policy.Target[structs.ScalingTargetNamespace] != namespace
		}
		// Anything that is not a scaling policy is filtered out as well.
		return true
	}
}

// StateSnapshot is used to provide a point-in-time snapshot. It embeds
// StateStore, so the StateStore methods are available on a *StateSnapshot in
// addition to the denormalization helpers defined on the snapshot itself.
type StateSnapshot struct {
	StateStore
}

// DenormalizeAllocationsMap takes a map of node IDs to allocation diffs
// (represented as Allocations) and replaces each node's slice, in the map
// itself, with the result of DenormalizeAllocationSlice.
//
// It stops at the first error; entries handled before that point have already
// been replaced in the map.
func (s *StateSnapshot) DenormalizeAllocationsMap(nodeAllocations map[string][]*structs.Allocation) error {
	for nodeID := range nodeAllocations {
		full, err := s.DenormalizeAllocationSlice(nodeAllocations[nodeID])
		if err != nil {
			return err
		}
		nodeAllocations[nodeID] = full
	}
	return nil
}

// DenormalizeAllocationSlice converts every allocation in allocs into its
// AllocationDiff and hands the result to DenormalizeAllocationDiffSlice, which
// merges the updated attributes with the allocations held in the snapshot.
//
// This should only be called on terminal allocs, particularly stopped or preempted allocs
func (s *StateSnapshot) DenormalizeAllocationSlice(allocs []*structs.Allocation) ([]*structs.Allocation, error) {
	diffs := make([]*structs.AllocationDiff, 0, len(allocs))
	for _, a := range allocs {
		diffs = append(diffs, a.AllocationDiff())
	}
	return s.DenormalizeAllocationDiffSlice(diffs)
}

// DenormalizeAllocationDiffSlice looks up the stored Allocation for each
// AllocationDiff and returns copies of those allocations with the diff's
// updated attributes merged in, in the same order as allocDiffs. It fails if
// any lookup errors or any referenced allocation does not exist.
//
// This should only be called on terminal alloc, particularly stopped or preempted allocs
func (s *StateSnapshot) DenormalizeAllocationDiffSlice(allocDiffs []*structs.AllocationDiff) ([]*structs.Allocation, error) {
	out := make([]*structs.Allocation, 0, len(allocDiffs))

	for _, diff := range allocDiffs {
		stored, err := s.AllocByID(nil, diff.ID)
		if err != nil {
			return nil, fmt.Errorf("alloc lookup failed: %v", err)
		}
		if stored == nil {
			return nil, fmt.Errorf("alloc %v doesn't exist", diff.ID)
		}

		// Merge onto a copy. The Job is deliberately left as stored so a
		// terminal alloc keeps referring to its latest Job view before
		// destruction, which also eases handler implementations.
		merged := stored.Copy()

		if preemptor := diff.PreemptedByAllocation; preemptor != "" {
			// Preempted alloc: record who preempted it and mark it for eviction.
			merged.PreemptedByAllocation = preemptor
			merged.DesiredDescription = getPreemptedAllocDesiredDescription(preemptor)
			merged.DesiredStatus = structs.AllocDesiredStatusEvict
		} else {
			// Stopped alloc: take the description from the diff and only
			// override client status / follow-up eval when the diff sets them.
			merged.DesiredDescription = diff.DesiredDescription
			merged.DesiredStatus = structs.AllocDesiredStatusStop
			if diff.ClientStatus != "" {
				merged.ClientStatus = diff.ClientStatus
			}
			if diff.FollowupEvalID != "" {
				merged.FollowupEvalID = diff.FollowupEvalID
			}
		}

		if diff.ModifyTime != 0 {
			merged.ModifyTime = diff.ModifyTime
		}

		out = append(out, merged)
	}

	return out, nil
}

// getPreemptedAllocDesiredDescription returns the desired-description text for
// an allocation preempted by the allocation with the given ID.
func getPreemptedAllocDesiredDescription(preemptedByAllocID string) string {
	const prefix = "Preempted by alloc ID "
	// Both operands are strings, so Sprint joins them without adding a space.
	return fmt.Sprint(prefix, preemptedByAllocID)
}

// UpsertRootKeyMeta saves root key metadata, replacing any stored entry that
// has the same KeyID.
//
// When an entry with the same KeyID is already stored, its CreateIndex and
// CreateTime are carried over onto rootKeyMeta; otherwise CreateIndex is set to
// index. ModifyIndex is always set to index. These fields are written on the
// caller's rootKeyMeta.
//
// The upsert counts as a rotation when rootKeyMeta is active and either no
// entry with that KeyID is stored yet or the stored entry is not active. On a
// rotation every stored key is visited in the same transaction: with rekey
// set, active and inactive keys are marked as rekeying; without it, active keys
// are marked inactive. Passing rekey without a rotation is an error.
func (s *StateStore) UpsertRootKeyMeta(index uint64, rootKeyMeta *structs.RootKeyMeta, rekey bool) error {
	txn := s.db.WriteTxn(index)
	defer txn.Abort()

	// get any existing key for updating
	raw, err := txn.First(TableRootKeyMeta, indexID, rootKeyMeta.KeyID)
	if err != nil {
		return fmt.Errorf("root key metadata lookup failed: %v", err)
	}

	isRotation := false

	if raw != nil {
		existing := raw.(*structs.RootKeyMeta)
		rootKeyMeta.CreateIndex = existing.CreateIndex
		rootKeyMeta.CreateTime = existing.CreateTime
		isRotation = !existing.Active() && rootKeyMeta.Active()
	} else {
		rootKeyMeta.CreateIndex = index
		isRotation = rootKeyMeta.Active()
	}
	rootKeyMeta.ModifyIndex = index

	if rekey && !isRotation {
		return fmt.Errorf("cannot rekey without setting the new key active")
	}

	// if the upsert is for a newly-active key, we need to set all the
	// other keys as inactive in the same transaction.
	if isRotation {
		iter, err := txn.Get(TableRootKeyMeta, indexID)
		if err != nil {
			return err
		}
		for {
			raw := iter.Next()
			if raw == nil {
				break
			}

			// The iterator hands back the object held by memdb. Modifying it
			// in place is unsupported by memdb and would be visible to readers
			// of earlier snapshots, so apply the state change to a copy and
			// insert the copy instead.
			// NOTE(review): this is a shallow copy, which is enough as long as
			// SetRekeying/SetInactive only assign scalar fields — confirm.
			key := *(raw.(*structs.RootKeyMeta))
			modified := false

			switch key.State {
			case structs.RootKeyStateInactive:
				if rekey {
					key.SetRekeying()
					modified = true
				}
			case structs.RootKeyStateActive:
				if rekey {
					key.SetRekeying()
				} else {
					key.SetInactive()
				}
				modified = true
			case structs.RootKeyStateRekeying, structs.RootKeyStateDeprecated:
				// nothing to do
			}

			if modified {
				key.ModifyIndex = index
				if err := txn.Insert(TableRootKeyMeta, &key); err != nil {
					return err
				}
			}

		}
	}

	if err := txn.Insert(TableRootKeyMeta, rootKeyMeta); err != nil {
		return err
	}

	// update the indexes table
	if err := txn.Insert("index", &IndexEntry{TableRootKeyMeta, index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}
	return txn.Commit()
}

// DeleteRootKeyMeta removes the root key metadata stored under keyID and
// records index for the root key metadata table, all in one write transaction.
// It returns an error if no entry exists for keyID.
func (s *StateStore) DeleteRootKeyMeta(index uint64, keyID string) error {
	tx := s.db.WriteTxn(index)
	defer tx.Abort()

	// find the old key
	meta, err := tx.First(TableRootKeyMeta, indexID, keyID)
	switch {
	case err != nil:
		return fmt.Errorf("root key metadata lookup failed: %v", err)
	case meta == nil:
		return fmt.Errorf("root key metadata not found")
	}

	if err = tx.Delete(TableRootKeyMeta, meta); err != nil {
		return fmt.Errorf("root key metadata delete failed: %v", err)
	}

	// update the indexes table
	if err = tx.Insert("index", &IndexEntry{TableRootKeyMeta, index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}
	return tx.Commit()
}

// RootKeyMetas returns an iterator over all root key metadata, walking the
// table's ID index in a read transaction. The iterator's watch channel is
// added to ws.
func (s *StateStore) RootKeyMetas(ws memdb.WatchSet) (memdb.ResultIterator, error) {
	metas, err := s.db.ReadTxn().Get(TableRootKeyMeta, indexID)
	if err != nil {
		return nil, err
	}

	ws.Add(metas.WatchCh())
	return metas, nil
}

// RootKeyMetaByID returns the root key metadata stored under id, or (nil, nil)
// when there is none. The lookup's watch channel is added to ws in both cases.
func (s *StateStore) RootKeyMetaByID(ws memdb.WatchSet, id string) (*structs.RootKeyMeta, error) {
	watchCh, found, err := s.db.ReadTxn().FirstWatch(TableRootKeyMeta, indexID, id)
	if err != nil {
		return nil, fmt.Errorf("root key metadata lookup failed: %v", err)
	}
	ws.Add(watchCh)

	if found == nil {
		return nil, nil
	}
	return found.(*structs.RootKeyMeta), nil
}

// GetActiveRootKeyMeta scans all root key metadata and returns the first entry
// that reports itself as active, or (nil, nil) when none does. The scan's watch
// channel is added to ws.
func (s *StateStore) GetActiveRootKeyMeta(ws memdb.WatchSet) (*structs.RootKeyMeta, error) {
	metas, err := s.db.ReadTxn().Get(TableRootKeyMeta, indexID)
	if err != nil {
		return nil, err
	}
	ws.Add(metas.WatchCh())

	for raw := metas.Next(); raw != nil; raw = metas.Next() {
		if meta := raw.(*structs.RootKeyMeta); meta.Active() {
			return meta, nil
		}
	}
	return nil, nil
}

// IsRootKeyMetaInUse determines whether a key has been used to sign a workload
// identity for a live allocation or encrypt any variables. It reports true as
// soon as either the allocs signing-key index or the variables key-ID index
// yields at least one entry for keyID.
func (s *StateStore) IsRootKeyMetaInUse(keyID string) (bool, error) {
	readTxn := s.db.ReadTxn()

	// NOTE(review): the trailing `true` presumably restricts the signing-key
	// index to live allocations — confirm against the index definition.
	allocs, err := readTxn.Get(TableAllocs, indexSigningKey, keyID, true)
	if err != nil {
		return false, err
	}
	if allocs.Next() != nil {
		return true, nil
	}

	variables, err := readTxn.Get(TableVariables, indexKeyID, keyID)
	if err != nil {
		return false, err
	}
	return variables.Next() != nil, nil
}