package state import ( "context" "errors" "fmt" "reflect" "sort" "strings" "time" "github.com/hashicorp/go-hclog" "github.com/hashicorp/go-memdb" "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/helper/pointer" "github.com/hashicorp/nomad/nomad/stream" "github.com/hashicorp/nomad/nomad/structs" ) // Txn is a transaction against a state store. // This can be a read or write transaction. type Txn = *txn // SortOption represents how results can be sorted. type SortOption bool const ( // SortDefault indicates that the result should be returned using the // default go-memdb ResultIterator order. SortDefault SortOption = false // SortReverse indicates that the result should be returned using the // reversed go-memdb ResultIterator order. SortReverse SortOption = true ) const ( // NodeEligibilityEventPlanRejectThreshold is the message used when the node // is set to ineligible due to multiple plan failures. // This is a preventive measure to signal scheduler workers to not consider // the node for future placements. // Plan rejections for a node are expected due to the optimistic and // concurrent nature of the scheduling process, but repeated failures for // the same node may indicate an underlying issue not detected by Nomad. // The plan applier keeps track of plan rejection history and will mark // nodes as ineligible if they cross a given threshold. NodeEligibilityEventPlanRejectThreshold = "Node marked as ineligible for scheduling due to multiple plan rejections, refer to https://www.nomadproject.io/s/port-plan-failure for more information" // NodeRegisterEventRegistered is the message used when the node becomes // registered. NodeRegisterEventRegistered = "Node registered" // NodeRegisterEventReregistered is the message used when the node becomes // re-registered. NodeRegisterEventReregistered = "Node re-registered" ) // terminate appends the go-memdb terminator character to s. // // We can then use the result for exact matches during prefix // scans over compound indexes that start with s. func terminate(s string) string { return s + "\x00" } // IndexEntry is used with the "index" table // for managing the latest Raft index affecting a table. type IndexEntry struct { Key string Value uint64 } // StateStoreConfig is used to configure a new state store type StateStoreConfig struct { // Logger is used to output the state store's logs Logger hclog.Logger // Region is the region of the server embedding the state store. Region string // EnablePublisher is used to enable or disable the event publisher EnablePublisher bool // EventBufferSize configures the amount of events to hold in memory EventBufferSize int64 } // The StateStore is responsible for maintaining all the Nomad // state. It is manipulated by the FSM which maintains consistency // through the use of Raft. The goals of the StateStore are to provide // high concurrency for read operations without blocking writes, and // to provide write availability in the face of reads. EVERY object // returned as a result of a read against the state store should be // considered a constant and NEVER modified in place. type StateStore struct { logger hclog.Logger db *changeTrackerDB // config is the passed in configuration config *StateStoreConfig // abandonCh is used to signal watchers that this state store has been // abandoned (usually during a restore). This is only ever closed. abandonCh chan struct{} // TODO: refactor abandonCh to use a context so that both can use the same // cancel mechanism. 
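// stopEventBroker is the cancel function for the context handed to the
// event broker in NewStateStore (when EnablePublisher is set). It is
// invoked by StopEventBroker during server shutdown, and by Abandon just
// before abandonCh is closed.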
stopEventBroker func() } type streamACLDelegate struct { s *StateStore } func (a *streamACLDelegate) TokenProvider() stream.ACLTokenProvider { resolver, _ := a.s.Snapshot() return resolver } // NewStateStore is used to create a new state store func NewStateStore(config *StateStoreConfig) (*StateStore, error) { // Create the MemDB db, err := memdb.NewMemDB(stateStoreSchema()) if err != nil { return nil, fmt.Errorf("state store setup failed: %v", err) } // Create the state store ctx, cancel := context.WithCancel(context.TODO()) s := &StateStore{ logger: config.Logger.Named("state_store"), config: config, abandonCh: make(chan struct{}), stopEventBroker: cancel, } if config.EnablePublisher { // Create new event publisher using provided config broker, err := stream.NewEventBroker(ctx, &streamACLDelegate{s}, stream.EventBrokerCfg{ EventBufferSize: config.EventBufferSize, Logger: config.Logger, }) if err != nil { return nil, fmt.Errorf("creating state store event broker %w", err) } s.db = NewChangeTrackerDB(db, broker, eventsFromChanges) } else { s.db = NewChangeTrackerDB(db, nil, noOpProcessChanges) } // Initialize the state store with the default namespace. if err := s.namespaceInit(); err != nil { return nil, fmt.Errorf("enterprise state store initialization failed: %v", err) } return s, nil } // NewWatchSet returns a new memdb.WatchSet that adds the state stores abandonCh // as a watcher. This is important in that it will notify when this specific // state store is no longer valid, usually due to a new snapshot being loaded func (s *StateStore) NewWatchSet() memdb.WatchSet { ws := memdb.NewWatchSet() ws.Add(s.AbandonCh()) return ws } func (s *StateStore) EventBroker() (*stream.EventBroker, error) { if s.db.publisher == nil { return nil, fmt.Errorf("EventBroker not configured") } return s.db.publisher, nil } // namespaceInit ensures the default namespace exists. func (s *StateStore) namespaceInit() error { // Create the default namespace. This is safe to do every time we create the // state store. There are two main cases, a brand new cluster in which case // each server will have the same default namespace object, or a new cluster // in which case if the default namespace has been modified, it will be // overridden by the restore code path. defaultNs := &structs.Namespace{ Name: structs.DefaultNamespace, Description: structs.DefaultNamespaceDescription, } if err := s.UpsertNamespaces(1, []*structs.Namespace{defaultNs}); err != nil { return fmt.Errorf("inserting default namespace failed: %v", err) } return nil } // Config returns the state store configuration. func (s *StateStore) Config() *StateStoreConfig { return s.config } // Snapshot is used to create a point in time snapshot. Because // we use MemDB, we just need to snapshot the state of the underlying // database. func (s *StateStore) Snapshot() (*StateSnapshot, error) { memDBSnap := s.db.memdb.Snapshot() store := StateStore{ logger: s.logger, config: s.config, } // Create a new change tracker DB that does not publish or track changes store.db = NewChangeTrackerDB(memDBSnap, nil, noOpProcessChanges) snap := &StateSnapshot{ StateStore: store, } return snap, nil } // SnapshotMinIndex is used to create a state snapshot where the index is // guaranteed to be greater than or equal to the index parameter. 
// // Some server operations (such as scheduling) exchange objects via RPC // concurrent with Raft log application, so they must ensure the state store // snapshot they are operating on is at or after the index the objects // retrieved via RPC were applied to the Raft log at. // // Callers should maintain their own timer metric as the time this method // blocks indicates Raft log application latency relative to scheduling. func (s *StateStore) SnapshotMinIndex(ctx context.Context, index uint64) (*StateSnapshot, error) { // Ported from work.go:waitForIndex prior to 0.9 const backoffBase = 20 * time.Millisecond const backoffLimit = 1 * time.Second var retries uint var retryTimer *time.Timer // XXX: Potential optimization is to set up a watch on the state // store's index table and only unblock via a trigger rather than // polling. for { // Get the states current index snapshotIndex, err := s.LatestIndex() if err != nil { return nil, fmt.Errorf("failed to determine state store's index: %v", err) } // We only need the FSM state to be as recent as the given index if snapshotIndex >= index { return s.Snapshot() } // Exponential back off retries++ if retryTimer == nil { // First retry, start at baseline retryTimer = time.NewTimer(backoffBase) } else { // Subsequent retry, reset timer deadline := 1 << (2 * retries) * backoffBase if deadline > backoffLimit { deadline = backoffLimit } retryTimer.Reset(deadline) } select { case <-ctx.Done(): return nil, ctx.Err() case <-retryTimer.C: } } } // Restore is used to optimize the efficiency of rebuilding // state by minimizing the number of transactions and checking // overhead. func (s *StateStore) Restore() (*StateRestore, error) { txn := s.db.WriteTxnRestore() r := &StateRestore{ txn: txn, } return r, nil } // AbandonCh returns a channel you can wait on to know if the state store was // abandoned. func (s *StateStore) AbandonCh() <-chan struct{} { return s.abandonCh } // Abandon is used to signal that the given state store has been abandoned. // Calling this more than one time will panic. func (s *StateStore) Abandon() { s.StopEventBroker() close(s.abandonCh) } // StopEventBroker calls the cancel func for the state stores event // publisher. It should be called during server shutdown. func (s *StateStore) StopEventBroker() { s.stopEventBroker() } // QueryFn is the definition of a function that can be used to implement a basic // blocking query against the state store. type QueryFn func(memdb.WatchSet, *StateStore) (resp interface{}, index uint64, err error) // BlockingQuery takes a query function and runs the function until the minimum // query index is met or until the passed context is cancelled. func (s *StateStore) BlockingQuery(query QueryFn, minIndex uint64, ctx context.Context) ( resp interface{}, index uint64, err error) { RUN_QUERY: // We capture the state store and its abandon channel but pass a snapshot to // the blocking query function. We operate on the snapshot to allow separate // calls to the state store not all wrapped within the same transaction. abandonCh := s.AbandonCh() snap, _ := s.Snapshot() stateSnap := &snap.StateStore // We can skip all watch tracking if this isn't a blocking query. var ws memdb.WatchSet if minIndex > 0 { ws = memdb.NewWatchSet() // This channel will be closed if a snapshot is restored and the // whole state store is abandoned. ws.Add(abandonCh) } resp, index, err = query(ws, stateSnap) if err != nil { return nil, index, err } // We haven't reached the min-index yet. 
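// The check below is what makes this a blocking query: if the result's
// index has not yet passed minIndex, we block on the accumulated watch set
// until something it covers changes (or the context is cancelled), then
// loop back to RUN_QUERY against a fresh snapshot. As an illustrative
// sketch only (store, namespace, jobID, and lastIndex are hypothetical
// caller-side values, not part of this package), a caller might write:
//
//	query := func(ws memdb.WatchSet, s *StateStore) (interface{}, uint64, error) {
//		job, err := s.JobByID(ws, namespace, jobID)
//		if err != nil {
//			return nil, 0, err
//		}
//		index, err := s.LatestIndex()
//		return job, index, err
//	}
//	raw, index, err := store.BlockingQuery(query, lastIndex, ctx)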
if minIndex > 0 && index <= minIndex { if err := ws.WatchCtx(ctx); err != nil { return nil, index, err } goto RUN_QUERY } return resp, index, nil } // UpsertPlanResults is used to upsert the results of a plan. func (s *StateStore) UpsertPlanResults(msgType structs.MessageType, index uint64, results *structs.ApplyPlanResultsRequest) error { snapshot, err := s.Snapshot() if err != nil { return err } allocsStopped, err := snapshot.DenormalizeAllocationDiffSlice(results.AllocsStopped) if err != nil { return err } allocsPreempted, err := snapshot.DenormalizeAllocationDiffSlice(results.AllocsPreempted) if err != nil { return err } // COMPAT 0.11: Remove this denormalization when NodePreemptions is removed results.NodePreemptions, err = snapshot.DenormalizeAllocationSlice(results.NodePreemptions) if err != nil { return err } txn := s.db.WriteTxnMsgT(msgType, index) defer txn.Abort() // Mark nodes as ineligible. for _, nodeID := range results.IneligibleNodes { s.logger.Warn("marking node as ineligible due to multiple plan rejections, refer to https://www.nomadproject.io/s/port-plan-failure for more information", "node_id", nodeID) nodeEvent := structs.NewNodeEvent(). SetSubsystem(structs.NodeEventSubsystemScheduler). SetMessage(NodeEligibilityEventPlanRejectThreshold) err := s.updateNodeEligibilityImpl(index, nodeID, structs.NodeSchedulingIneligible, results.UpdatedAt, nodeEvent, txn) if err != nil { return err } } // Upsert the newly created or updated deployment if results.Deployment != nil { if err := s.upsertDeploymentImpl(index, results.Deployment, txn); err != nil { return err } } // Update the status of deployments effected by the plan. if len(results.DeploymentUpdates) != 0 { s.upsertDeploymentUpdates(index, results.DeploymentUpdates, txn) } if results.EvalID != "" { // Update the modify index of the eval id if err := s.updateEvalModifyIndex(txn, index, results.EvalID); err != nil { return err } } numAllocs := 0 if len(results.Alloc) > 0 || len(results.NodePreemptions) > 0 { // COMPAT 0.11: This branch will be removed, when Alloc is removed // Attach the job to all the allocations. It is pulled out in the payload to // avoid the redundancy of encoding, but should be denormalized prior to // being inserted into MemDB. addComputedAllocAttrs(results.Alloc, results.Job) numAllocs = len(results.Alloc) + len(results.NodePreemptions) } else { // Attach the job to all the allocations. It is pulled out in the payload to // avoid the redundancy of encoding, but should be denormalized prior to // being inserted into MemDB. addComputedAllocAttrs(results.AllocsUpdated, results.Job) numAllocs = len(allocsStopped) + len(results.AllocsUpdated) + len(allocsPreempted) } allocsToUpsert := make([]*structs.Allocation, 0, numAllocs) // COMPAT 0.11: Both these appends should be removed when Alloc and NodePreemptions are removed allocsToUpsert = append(allocsToUpsert, results.Alloc...) allocsToUpsert = append(allocsToUpsert, results.NodePreemptions...) allocsToUpsert = append(allocsToUpsert, allocsStopped...) allocsToUpsert = append(allocsToUpsert, results.AllocsUpdated...) allocsToUpsert = append(allocsToUpsert, allocsPreempted...) 
// handle upgrade path for _, alloc := range allocsToUpsert { alloc.Canonicalize() } if err := s.upsertAllocsImpl(index, allocsToUpsert, txn); err != nil { return err } // Upsert followup evals for allocs that were preempted for _, eval := range results.PreemptionEvals { if err := s.nestedUpsertEval(txn, index, eval); err != nil { return err } } return txn.Commit() } // addComputedAllocAttrs adds the computed/derived attributes to the allocation. // This method is used when an allocation is being denormalized. func addComputedAllocAttrs(allocs []*structs.Allocation, job *structs.Job) { structs.DenormalizeAllocationJobs(job, allocs) // COMPAT(0.11): Remove in 0.11 // Calculate the total resources of allocations. It is pulled out in the // payload to avoid encoding something that can be computed, but should be // denormalized prior to being inserted into MemDB. for _, alloc := range allocs { if alloc.Resources != nil { continue } alloc.Resources = new(structs.Resources) for _, task := range alloc.TaskResources { alloc.Resources.Add(task) } // Add the shared resources alloc.Resources.Add(alloc.SharedResources) } } // upsertDeploymentUpdates updates the deployments given the passed status // updates. func (s *StateStore) upsertDeploymentUpdates(index uint64, updates []*structs.DeploymentStatusUpdate, txn *txn) error { for _, u := range updates { if err := s.updateDeploymentStatusImpl(index, u, txn); err != nil { return err } } return nil } // UpsertJobSummary upserts a job summary into the state store. func (s *StateStore) UpsertJobSummary(index uint64, jobSummary *structs.JobSummary) error { txn := s.db.WriteTxn(index) defer txn.Abort() // Check if the job summary already exists existing, err := txn.First("job_summary", "id", jobSummary.Namespace, jobSummary.JobID) if err != nil { return fmt.Errorf("job summary lookup failed: %v", err) } // Setup the indexes correctly if existing != nil { jobSummary.CreateIndex = existing.(*structs.JobSummary).CreateIndex jobSummary.ModifyIndex = index } else { jobSummary.CreateIndex = index jobSummary.ModifyIndex = index } // Update the index if err := txn.Insert("job_summary", jobSummary); err != nil { return err } // Update the indexes table for job summary if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return txn.Commit() } // DeleteJobSummary deletes the job summary with the given ID. This is for // testing purposes only. func (s *StateStore) DeleteJobSummary(index uint64, namespace, id string) error { txn := s.db.WriteTxn(index) defer txn.Abort() // Delete the job summary if _, err := txn.DeleteAll("job_summary", "id", namespace, id); err != nil { return fmt.Errorf("deleting job summary failed: %v", err) } if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return txn.Commit() } // UpsertDeployment is used to insert a new deployment. If cancelPrior is set to // true, all prior deployments for the same job will be cancelled. 
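// The upsert runs in a single write transaction via upsertDeploymentImpl
// below, which preserves CreateIndex for existing deployments, bumps
// ModifyIndex, updates the "index" table entry for "deployment", and, when
// the deployment status is DeploymentStatusSuccessful, marks the
// corresponding job version as stable via updateJobStabilityImpl.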
func (s *StateStore) UpsertDeployment(index uint64, deployment *structs.Deployment) error { txn := s.db.WriteTxn(index) defer txn.Abort() if err := s.upsertDeploymentImpl(index, deployment, txn); err != nil { return err } return txn.Commit() } func (s *StateStore) upsertDeploymentImpl(index uint64, deployment *structs.Deployment, txn *txn) error { // Check if the deployment already exists existing, err := txn.First("deployment", "id", deployment.ID) if err != nil { return fmt.Errorf("deployment lookup failed: %v", err) } // Setup the indexes correctly if existing != nil { deployment.CreateIndex = existing.(*structs.Deployment).CreateIndex deployment.ModifyIndex = index } else { deployment.CreateIndex = index deployment.ModifyIndex = index } // Insert the deployment if err := txn.Insert("deployment", deployment); err != nil { return err } // Update the indexes table for deployment if err := txn.Insert("index", &IndexEntry{"deployment", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } // If the deployment is being marked as complete, set the job to stable. if deployment.Status == structs.DeploymentStatusSuccessful { if err := s.updateJobStabilityImpl(index, deployment.Namespace, deployment.JobID, deployment.JobVersion, true, txn); err != nil { return fmt.Errorf("failed to update job stability: %v", err) } } return nil } func (s *StateStore) Deployments(ws memdb.WatchSet, sort SortOption) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() var it memdb.ResultIterator var err error switch sort { case SortReverse: it, err = txn.GetReverse("deployment", "create") default: it, err = txn.Get("deployment", "create") } if err != nil { return nil, err } ws.Add(it.WatchCh()) return it, nil } func (s *StateStore) DeploymentsByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() // Walk the entire deployments table iter, err := txn.Get("deployment", "namespace", namespace) if err != nil { return nil, err } ws.Add(iter.WatchCh()) return iter, nil } func (s *StateStore) DeploymentsByNamespaceOrdered(ws memdb.WatchSet, namespace string, sort SortOption) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() var ( it memdb.ResultIterator err error exact = terminate(namespace) ) switch sort { case SortReverse: it, err = txn.GetReverse("deployment", "namespace_create_prefix", exact) default: it, err = txn.Get("deployment", "namespace_create_prefix", exact) } if err != nil { return nil, err } ws.Add(it.WatchCh()) return it, nil } func (s *StateStore) DeploymentsByIDPrefix(ws memdb.WatchSet, namespace, deploymentID string, sort SortOption) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() var iter memdb.ResultIterator var err error // Walk the entire deployments table switch sort { case SortReverse: iter, err = txn.GetReverse("deployment", "id_prefix", deploymentID) default: iter, err = txn.Get("deployment", "id_prefix", deploymentID) } if err != nil { return nil, err } ws.Add(iter.WatchCh()) // Wrap the iterator in a filter wrap := memdb.NewFilterIterator(iter, deploymentNamespaceFilter(namespace)) return wrap, nil } // deploymentNamespaceFilter returns a filter function that filters all // deployment not in the given namespace. 
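// The returned function follows go-memdb's filter convention: returning true
// drops the object from the wrapped iterator's results. Values that fail the
// type assertion are dropped as well, so the filtered iterator only yields
// *structs.Deployment values in the requested namespace (this is what
// DeploymentsByIDPrefix above relies on to scope an ID-prefix scan to a
// single namespace).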
func deploymentNamespaceFilter(namespace string) func(interface{}) bool { return func(raw interface{}) bool { d, ok := raw.(*structs.Deployment) if !ok { return true } return d.Namespace != namespace } } func (s *StateStore) DeploymentByID(ws memdb.WatchSet, deploymentID string) (*structs.Deployment, error) { txn := s.db.ReadTxn() return s.deploymentByIDImpl(ws, deploymentID, txn) } func (s *StateStore) deploymentByIDImpl(ws memdb.WatchSet, deploymentID string, txn *txn) (*structs.Deployment, error) { watchCh, existing, err := txn.FirstWatch("deployment", "id", deploymentID) if err != nil { return nil, fmt.Errorf("deployment lookup failed: %v", err) } ws.Add(watchCh) if existing != nil { return existing.(*structs.Deployment), nil } return nil, nil } func (s *StateStore) DeploymentsByJobID(ws memdb.WatchSet, namespace, jobID string, all bool) ([]*structs.Deployment, error) { txn := s.db.ReadTxn() var job *structs.Job // Read job from state store _, existing, err := txn.FirstWatch("jobs", "id", namespace, jobID) if err != nil { return nil, fmt.Errorf("job lookup failed: %v", err) } if existing != nil { job = existing.(*structs.Job) } // Get an iterator over the deployments iter, err := txn.Get("deployment", "job", namespace, jobID) if err != nil { return nil, err } ws.Add(iter.WatchCh()) var out []*structs.Deployment for { raw := iter.Next() if raw == nil { break } d := raw.(*structs.Deployment) // If the allocation belongs to a job with the same ID but a different // create index and we are not getting all the allocations whose Jobs // matches the same Job ID then we skip it if !all && job != nil && d.JobCreateIndex != job.CreateIndex { continue } out = append(out, d) } return out, nil } // LatestDeploymentByJobID returns the latest deployment for the given job. The // latest is determined strictly by CreateIndex. func (s *StateStore) LatestDeploymentByJobID(ws memdb.WatchSet, namespace, jobID string) (*structs.Deployment, error) { txn := s.db.ReadTxn() // Get an iterator over the deployments iter, err := txn.Get("deployment", "job", namespace, jobID) if err != nil { return nil, err } ws.Add(iter.WatchCh()) var out *structs.Deployment for { raw := iter.Next() if raw == nil { break } d := raw.(*structs.Deployment) if out == nil || out.CreateIndex < d.CreateIndex { out = d } } return out, nil } // DeleteDeployment is used to delete a set of deployments by ID func (s *StateStore) DeleteDeployment(index uint64, deploymentIDs []string) error { txn := s.db.WriteTxn(index) defer txn.Abort() if len(deploymentIDs) == 0 { return nil } for _, deploymentID := range deploymentIDs { // Lookup the deployment existing, err := txn.First("deployment", "id", deploymentID) if err != nil { return fmt.Errorf("deployment lookup failed: %v", err) } if existing == nil { return fmt.Errorf("deployment not found") } // Delete the deployment if err := txn.Delete("deployment", existing); err != nil { return fmt.Errorf("deployment delete failed: %v", err) } } if err := txn.Insert("index", &IndexEntry{"deployment", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return txn.Commit() } // UpsertScalingEvent is used to insert a new scaling event. // Only the most recent JobTrackedScalingEvents will be kept. 
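// Events are tracked per task group: the new event is prepended to the
// ScalingEvents slice for req.TaskGroup and the slice is then truncated to
// structs.JobTrackedScalingEvents entries, so the most recent events are
// always the ones retained.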
func (s *StateStore) UpsertScalingEvent(index uint64, req *structs.ScalingEventRequest) error { txn := s.db.WriteTxn(index) defer txn.Abort() // Get the existing events existing, err := txn.First("scaling_event", "id", req.Namespace, req.JobID) if err != nil { return fmt.Errorf("scaling event lookup failed: %v", err) } var jobEvents *structs.JobScalingEvents if existing != nil { jobEvents = existing.(*structs.JobScalingEvents) } else { jobEvents = &structs.JobScalingEvents{ Namespace: req.Namespace, JobID: req.JobID, ScalingEvents: make(map[string][]*structs.ScalingEvent), } } jobEvents.ModifyIndex = index req.ScalingEvent.CreateIndex = index events := jobEvents.ScalingEvents[req.TaskGroup] // Prepend this latest event events = append( []*structs.ScalingEvent{req.ScalingEvent}, events..., ) // Truncate older events if len(events) > structs.JobTrackedScalingEvents { events = events[0:structs.JobTrackedScalingEvents] } jobEvents.ScalingEvents[req.TaskGroup] = events // Insert the new event if err := txn.Insert("scaling_event", jobEvents); err != nil { return fmt.Errorf("scaling event insert failed: %v", err) } // Update the indexes table for scaling_event if err := txn.Insert("index", &IndexEntry{"scaling_event", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return txn.Commit() } // ScalingEvents returns an iterator over all the job scaling events func (s *StateStore) ScalingEvents(ws memdb.WatchSet) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() // Walk the entire scaling_event table iter, err := txn.Get("scaling_event", "id") if err != nil { return nil, err } ws.Add(iter.WatchCh()) return iter, nil } func (s *StateStore) ScalingEventsByJob(ws memdb.WatchSet, namespace, jobID string) (map[string][]*structs.ScalingEvent, uint64, error) { txn := s.db.ReadTxn() watchCh, existing, err := txn.FirstWatch("scaling_event", "id", namespace, jobID) if err != nil { return nil, 0, fmt.Errorf("job scaling events lookup failed: %v", err) } ws.Add(watchCh) if existing != nil { events := existing.(*structs.JobScalingEvents) return events.ScalingEvents, events.ModifyIndex, nil } return nil, 0, nil } // UpsertNode is used to register a node or update a node definition. // This is assumed to be triggered by the client, so we retain the value // of drain/eligibility which is set by the scheduler. func (s *StateStore) UpsertNode(msgType structs.MessageType, index uint64, node *structs.Node) error { txn := s.db.WriteTxnMsgT(msgType, index) defer txn.Abort() err := upsertNodeTxn(txn, index, node) if err != nil { return err } return txn.Commit() } func upsertNodeTxn(txn *txn, index uint64, node *structs.Node) error { // Check if the node already exists existing, err := txn.First("nodes", "id", node.ID) if err != nil { return fmt.Errorf("node lookup failed: %v", err) } // Setup the indexes correctly if existing != nil { exist := existing.(*structs.Node) node.CreateIndex = exist.CreateIndex node.ModifyIndex = index // Retain node events that have already been set on the node node.Events = exist.Events // If we are transitioning from down, record the re-registration if exist.Status == structs.NodeStatusDown && node.Status != structs.NodeStatusDown { appendNodeEvents(index, node, []*structs.NodeEvent{ structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster). SetMessage(NodeRegisterEventReregistered).
SetTimestamp(time.Unix(node.StatusUpdatedAt, 0))}) } node.SchedulingEligibility = exist.SchedulingEligibility // Retain the eligibility node.DrainStrategy = exist.DrainStrategy // Retain the drain strategy node.LastDrain = exist.LastDrain // Retain the drain metadata } else { // Because this is the first time the node is being registered, we should // also create a node registration event nodeEvent := structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster). SetMessage(NodeRegisterEventRegistered). SetTimestamp(time.Unix(node.StatusUpdatedAt, 0)) node.Events = []*structs.NodeEvent{nodeEvent} node.CreateIndex = index node.ModifyIndex = index } // Insert the node if err := txn.Insert("nodes", node); err != nil { return fmt.Errorf("node insert failed: %v", err) } if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } if err := upsertCSIPluginsForNode(txn, node, index); err != nil { return fmt.Errorf("csi plugin update failed: %v", err) } return nil } // DeleteNode deregisters a batch of nodes func (s *StateStore) DeleteNode(msgType structs.MessageType, index uint64, nodes []string) error { txn := s.db.WriteTxn(index) defer txn.Abort() err := deleteNodeTxn(txn, index, nodes) if err != nil { return err } return txn.Commit() } func deleteNodeTxn(txn *txn, index uint64, nodes []string) error { if len(nodes) == 0 { return fmt.Errorf("node ids missing") } for _, nodeID := range nodes { existing, err := txn.First("nodes", "id", nodeID) if err != nil { return fmt.Errorf("node lookup failed: %s: %v", nodeID, err) } if existing == nil { return fmt.Errorf("node not found: %s", nodeID) } // Delete the node if err := txn.Delete("nodes", existing); err != nil { return fmt.Errorf("node delete failed: %s: %v", nodeID, err) } node := existing.(*structs.Node) if err := deleteNodeCSIPlugins(txn, node, index); err != nil { return fmt.Errorf("csi plugin delete failed: %v", err) } } if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return nil } // UpdateNodeStatus is used to update the status of a node func (s *StateStore) UpdateNodeStatus(msgType structs.MessageType, index uint64, nodeID, status string, updatedAt int64, event *structs.NodeEvent) error { txn := s.db.WriteTxnMsgT(msgType, index) defer txn.Abort() if err := s.updateNodeStatusTxn(txn, nodeID, status, updatedAt, event); err != nil { return err } return txn.Commit() } func (s *StateStore) updateNodeStatusTxn(txn *txn, nodeID, status string, updatedAt int64, event *structs.NodeEvent) error { // Lookup the node existing, err := txn.First("nodes", "id", nodeID) if err != nil { return fmt.Errorf("node lookup failed: %v", err) } if existing == nil { return fmt.Errorf("node not found") } // Copy the existing node existingNode := existing.(*structs.Node) copyNode := existingNode.Copy() copyNode.StatusUpdatedAt = updatedAt // Add the event if given if event != nil { appendNodeEvents(txn.Index, copyNode, []*structs.NodeEvent{event}) } // Update the status in the copy copyNode.Status = status copyNode.ModifyIndex = txn.Index // Insert the node if err := txn.Insert("nodes", copyNode); err != nil { return fmt.Errorf("node update failed: %v", err) } if err := txn.Insert("index", &IndexEntry{"nodes", txn.Index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return nil } // BatchUpdateNodeDrain is used to update the drain of a set of nodes.
// This is currently only called when node drain is completed by the drainer. func (s *StateStore) BatchUpdateNodeDrain(msgType structs.MessageType, index uint64, updatedAt int64, updates map[string]*structs.DrainUpdate, events map[string]*structs.NodeEvent) error { txn := s.db.WriteTxnMsgT(msgType, index) defer txn.Abort() for node, update := range updates { if err := s.updateNodeDrainImpl(txn, index, node, update.DrainStrategy, update.MarkEligible, updatedAt, events[node], nil, "", true); err != nil { return err } } return txn.Commit() } // UpdateNodeDrain is used to update the drain of a node func (s *StateStore) UpdateNodeDrain(msgType structs.MessageType, index uint64, nodeID string, drain *structs.DrainStrategy, markEligible bool, updatedAt int64, event *structs.NodeEvent, drainMeta map[string]string, accessorId string) error { txn := s.db.WriteTxnMsgT(msgType, index) defer txn.Abort() if err := s.updateNodeDrainImpl(txn, index, nodeID, drain, markEligible, updatedAt, event, drainMeta, accessorId, false); err != nil { return err } return txn.Commit() } func (s *StateStore) updateNodeDrainImpl(txn *txn, index uint64, nodeID string, drain *structs.DrainStrategy, markEligible bool, updatedAt int64, event *structs.NodeEvent, drainMeta map[string]string, accessorId string, drainCompleted bool) error { // Lookup the node existing, err := txn.First("nodes", "id", nodeID) if err != nil { return fmt.Errorf("node lookup failed: %v", err) } if existing == nil { return fmt.Errorf("node not found") } // Copy the existing node existingNode := existing.(*structs.Node) updatedNode := existingNode.Copy() updatedNode.StatusUpdatedAt = updatedAt // Add the event if given if event != nil { appendNodeEvents(index, updatedNode, []*structs.NodeEvent{event}) } // Update the drain in the copy updatedNode.DrainStrategy = drain if drain != nil { updatedNode.SchedulingEligibility = structs.NodeSchedulingIneligible } else if markEligible { updatedNode.SchedulingEligibility = structs.NodeSchedulingEligible } // Update LastDrain updateTime := time.Unix(updatedAt, 0) // if drain strategy isn't set before or after, this wasn't a drain operation // in that case, we don't care about .LastDrain drainNoop := existingNode.DrainStrategy == nil && updatedNode.DrainStrategy == nil // otherwise, when done with this method, updatedNode.LastDrain should be set // if starting a new drain operation, create a new LastDrain. otherwise, update the existing one. 
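// To summarize the branches below: LastDrain is left untouched only when
// DrainStrategy was nil both before and after this update. Otherwise its
// Status ends up as DrainStatusDraining while a strategy is still set,
// DrainStatusComplete when the strategy was cleared because the drain
// finished (drainCompleted), and DrainStatusCanceled when the strategy was
// cleared by cancelling the drain.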
startedDraining := existingNode.DrainStrategy == nil && updatedNode.DrainStrategy != nil if !drainNoop { if startedDraining { updatedNode.LastDrain = &structs.DrainMetadata{ StartedAt: updateTime, Meta: drainMeta, } } else if updatedNode.LastDrain == nil { // if already draining and LastDrain doesn't exist, we need to create a new one // this could happen if we upgraded to 1.1.x during a drain updatedNode.LastDrain = &structs.DrainMetadata{ // we don't have sub-second accuracy on these fields, so truncate this StartedAt: time.Unix(existingNode.DrainStrategy.StartedAt.Unix(), 0), Meta: drainMeta, } } updatedNode.LastDrain.UpdatedAt = updateTime // won't have new metadata on drain complete; keep the existing operator-provided metadata // also, keep existing if they didn't provide it if len(drainMeta) != 0 { updatedNode.LastDrain.Meta = drainMeta } // we won't have an accessor ID on drain complete, so don't overwrite the existing one if accessorId != "" { updatedNode.LastDrain.AccessorID = accessorId } if updatedNode.DrainStrategy != nil { updatedNode.LastDrain.Status = structs.DrainStatusDraining } else if drainCompleted { updatedNode.LastDrain.Status = structs.DrainStatusComplete } else { updatedNode.LastDrain.Status = structs.DrainStatusCanceled } } updatedNode.ModifyIndex = index // Insert the node if err := txn.Insert("nodes", updatedNode); err != nil { return fmt.Errorf("node update failed: %v", err) } if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return nil } // UpdateNodeEligibility is used to update the scheduling eligibility of a node func (s *StateStore) UpdateNodeEligibility(msgType structs.MessageType, index uint64, nodeID string, eligibility string, updatedAt int64, event *structs.NodeEvent) error { txn := s.db.WriteTxnMsgT(msgType, index) defer txn.Abort() if err := s.updateNodeEligibilityImpl(index, nodeID, eligibility, updatedAt, event, txn); err != nil { return err } return txn.Commit() } func (s *StateStore) updateNodeEligibilityImpl(index uint64, nodeID string, eligibility string, updatedAt int64, event *structs.NodeEvent, txn *txn) error { // Lookup the node existing, err := txn.First("nodes", "id", nodeID) if err != nil { return fmt.Errorf("node lookup failed: %v", err) } if existing == nil { return fmt.Errorf("node not found") } // Copy the existing node existingNode := existing.(*structs.Node) copyNode := existingNode.Copy() copyNode.StatusUpdatedAt = updatedAt // Add the event if given if event != nil { appendNodeEvents(index, copyNode, []*structs.NodeEvent{event}) } // Check if this is a valid action if copyNode.DrainStrategy != nil && eligibility == structs.NodeSchedulingEligible { return fmt.Errorf("can not set node's scheduling eligibility to eligible while it is draining") } // Update the eligibility in the copy copyNode.SchedulingEligibility = eligibility copyNode.ModifyIndex = index // Insert the node if err := txn.Insert("nodes", copyNode); err != nil { return fmt.Errorf("node update failed: %v", err) } if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return nil } // UpsertNodeEvents adds the node events to the nodes, rotating events as // necessary. 
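// The nodeEvents argument is keyed by node ID; each node's events are
// appended via appendNodeEvents and the per-node history is pruned to
// structs.MaxRetainedNodeEvents, dropping the oldest entries first. As an
// illustrative sketch only (store, msgType, index, and nodeID are
// hypothetical caller-side values):
//
//	ev := structs.NewNodeEvent().
//		SetSubsystem(structs.NodeEventSubsystemCluster).
//		SetMessage("example message")
//	err := store.UpsertNodeEvents(msgType, index, map[string][]*structs.NodeEvent{
//		nodeID: {ev},
//	})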
func (s *StateStore) UpsertNodeEvents(msgType structs.MessageType, index uint64, nodeEvents map[string][]*structs.NodeEvent) error { txn := s.db.WriteTxnMsgT(msgType, index) defer txn.Abort() for nodeID, events := range nodeEvents { if err := s.upsertNodeEvents(index, nodeID, events, txn); err != nil { return err } } return txn.Commit() } // upsertNodeEvent upserts a node event for a respective node. It also maintains // that a fixed number of node events are ever stored simultaneously, deleting // older events once this bound has been reached. func (s *StateStore) upsertNodeEvents(index uint64, nodeID string, events []*structs.NodeEvent, txn *txn) error { // Lookup the node existing, err := txn.First("nodes", "id", nodeID) if err != nil { return fmt.Errorf("node lookup failed: %v", err) } if existing == nil { return fmt.Errorf("node not found") } // Copy the existing node existingNode := existing.(*structs.Node) copyNode := existingNode.Copy() appendNodeEvents(index, copyNode, events) // Insert the node if err := txn.Insert("nodes", copyNode); err != nil { return fmt.Errorf("node update failed: %v", err) } if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return nil } // appendNodeEvents is a helper that takes a node and new events and appends // them, pruning older events as needed. func appendNodeEvents(index uint64, node *structs.Node, events []*structs.NodeEvent) { // Add the events, updating the indexes for _, e := range events { e.CreateIndex = index node.Events = append(node.Events, e) } // Keep node events pruned to not exceed the max allowed if l := len(node.Events); l > structs.MaxRetainedNodeEvents { delta := l - structs.MaxRetainedNodeEvents node.Events = node.Events[delta:] } } // upsertCSIPluginsForNode indexes csi plugins for volume retrieval, with health. It's called // on upsertNodeEvents, so that event driven health changes are updated func upsertCSIPluginsForNode(txn *txn, node *structs.Node, index uint64) error { upsertFn := func(info *structs.CSIInfo) error { raw, err := txn.First("csi_plugins", "id", info.PluginID) if err != nil { return fmt.Errorf("csi_plugin lookup error: %s %v", info.PluginID, err) } var plug *structs.CSIPlugin if raw != nil { plug = raw.(*structs.CSIPlugin).Copy() } else { if !info.Healthy { // we don't want to create new plugins for unhealthy // allocs, otherwise we'd recreate the plugin when we // get the update for the alloc becoming terminal return nil } plug = structs.NewCSIPlugin(info.PluginID, index) } // the plugin may have been created by the job being updated, in which case // this data will not be configured, it's only available to the fingerprint // system plug.Provider = info.Provider plug.Version = info.ProviderVersion err = plug.AddPlugin(node.ID, info) if err != nil { return err } plug.ModifyIndex = index err = txn.Insert("csi_plugins", plug) if err != nil { return fmt.Errorf("csi_plugins insert error: %v", err) } return nil } inUseController := map[string]struct{}{} inUseNode := map[string]struct{}{} for _, info := range node.CSIControllerPlugins { err := upsertFn(info) if err != nil { return err } inUseController[info.PluginID] = struct{}{} } for _, info := range node.CSINodePlugins { err := upsertFn(info) if err != nil { return err } inUseNode[info.PluginID] = struct{}{} } // remove the client node from any plugin that's not // running on it. 
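// This second pass walks every plugin in the csi_plugins table, removes this
// node from any plugin it no longer fingerprints (controller and node
// registrations are tracked independently via inUseController and inUseNode),
// and then lets updateOrGCPlugin either persist the updated plugin or delete
// it once the plugin reports IsEmpty.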
iter, err := txn.Get("csi_plugins", "id") if err != nil { return fmt.Errorf("csi_plugins lookup failed: %v", err) } for { raw := iter.Next() if raw == nil { break } plug, ok := raw.(*structs.CSIPlugin) if !ok { continue } plug = plug.Copy() var hadDelete bool if _, ok := inUseController[plug.ID]; !ok { if _, asController := plug.Controllers[node.ID]; asController { err := plug.DeleteNodeForType(node.ID, structs.CSIPluginTypeController) if err != nil { return err } hadDelete = true } } if _, ok := inUseNode[plug.ID]; !ok { if _, asNode := plug.Nodes[node.ID]; asNode { err := plug.DeleteNodeForType(node.ID, structs.CSIPluginTypeNode) if err != nil { return err } hadDelete = true } } // we check this flag both for performance and to make sure we // don't delete a plugin when registering a node plugin but // no controller if hadDelete { err = updateOrGCPlugin(index, txn, plug) if err != nil { return err } } } if err := txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return nil } // deleteNodeCSIPlugins cleans up CSIInfo node health status, called in DeleteNode func deleteNodeCSIPlugins(txn *txn, node *structs.Node, index uint64) error { if len(node.CSIControllerPlugins) == 0 && len(node.CSINodePlugins) == 0 { return nil } names := map[string]struct{}{} for _, info := range node.CSIControllerPlugins { names[info.PluginID] = struct{}{} } for _, info := range node.CSINodePlugins { names[info.PluginID] = struct{}{} } for id := range names { raw, err := txn.First("csi_plugins", "id", id) if err != nil { return fmt.Errorf("csi_plugins lookup error %s: %v", id, err) } if raw == nil { // plugin may have been deregistered but we didn't // update the fingerprint yet continue } plug := raw.(*structs.CSIPlugin).Copy() err = plug.DeleteNode(node.ID) if err != nil { return err } err = updateOrGCPlugin(index, txn, plug) if err != nil { return err } } if err := txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return nil } // updateOrGCPlugin updates a plugin but will delete it if the plugin is empty func updateOrGCPlugin(index uint64, txn Txn, plug *structs.CSIPlugin) error { plug.ModifyIndex = index if plug.IsEmpty() { err := txn.Delete("csi_plugins", plug) if err != nil { return fmt.Errorf("csi_plugins delete error: %v", err) } } else { err := txn.Insert("csi_plugins", plug) if err != nil { return fmt.Errorf("csi_plugins update error %s: %v", plug.ID, err) } } return nil } // deleteJobFromPlugins removes the allocations of this job from any plugins the job is // running, possibly deleting the plugin if it's no longer in use. 
It's called in DeleteJobTxn func (s *StateStore) deleteJobFromPlugins(index uint64, txn Txn, job *structs.Job) error { ws := memdb.NewWatchSet() summary, err := s.JobSummaryByID(ws, job.Namespace, job.ID) if err != nil { return fmt.Errorf("error getting job summary: %v", err) } allocs, err := s.AllocsByJob(ws, job.Namespace, job.ID, false) if err != nil { return fmt.Errorf("error getting allocations: %v", err) } type pair struct { pluginID string alloc *structs.Allocation } plugAllocs := []*pair{} found := map[string]struct{}{} // Find plugins for allocs that belong to this job for _, a := range allocs { tg := a.Job.LookupTaskGroup(a.TaskGroup) found[tg.Name] = struct{}{} for _, t := range tg.Tasks { if t.CSIPluginConfig == nil { continue } plugAllocs = append(plugAllocs, &pair{ pluginID: t.CSIPluginConfig.ID, alloc: a, }) } } // Find any plugins that do not yet have allocs for this job for _, tg := range job.TaskGroups { if _, ok := found[tg.Name]; ok { continue } for _, t := range tg.Tasks { if t.CSIPluginConfig == nil { continue } plugAllocs = append(plugAllocs, &pair{ pluginID: t.CSIPluginConfig.ID, }) } } plugins := map[string]*structs.CSIPlugin{} for _, x := range plugAllocs { plug, ok := plugins[x.pluginID] if !ok { plug, err = s.CSIPluginByIDTxn(txn, nil, x.pluginID) if err != nil { return fmt.Errorf("error getting plugin: %s, %v", x.pluginID, err) } if plug == nil { // plugin was never successfully registered or has been // GC'd out from under us continue } // only copy once, so we update the same plugin on each alloc plugins[x.pluginID] = plug.Copy() plug = plugins[x.pluginID] } if x.alloc == nil { continue } err := plug.DeleteAlloc(x.alloc.ID, x.alloc.NodeID) if err != nil { return err } } for _, plug := range plugins { plug.DeleteJob(job, summary) err = updateOrGCPlugin(index, txn, plug) if err != nil { return err } } if len(plugins) > 0 { if err = txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } } return nil } // NodeByID is used to lookup a node by ID func (s *StateStore) NodeByID(ws memdb.WatchSet, nodeID string) (*structs.Node, error) { txn := s.db.ReadTxn() watchCh, existing, err := txn.FirstWatch("nodes", "id", nodeID) if err != nil { return nil, fmt.Errorf("node lookup failed: %v", err) } ws.Add(watchCh) if existing != nil { return existing.(*structs.Node), nil } return nil, nil } // NodesByIDPrefix is used to lookup nodes by prefix func (s *StateStore) NodesByIDPrefix(ws memdb.WatchSet, nodeID string) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() iter, err := txn.Get("nodes", "id_prefix", nodeID) if err != nil { return nil, fmt.Errorf("node lookup failed: %v", err) } ws.Add(iter.WatchCh()) return iter, nil } // NodeBySecretID is used to lookup a node by SecretID func (s *StateStore) NodeBySecretID(ws memdb.WatchSet, secretID string) (*structs.Node, error) { txn := s.db.ReadTxn() watchCh, existing, err := txn.FirstWatch("nodes", "secret_id", secretID) if err != nil { return nil, fmt.Errorf("node lookup by SecretID failed: %v", err) } ws.Add(watchCh) if existing != nil { return existing.(*structs.Node), nil } return nil, nil } // Nodes returns an iterator over all the nodes func (s *StateStore) Nodes(ws memdb.WatchSet) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() // Walk the entire nodes table iter, err := txn.Get("nodes", "id") if err != nil { return nil, err } ws.Add(iter.WatchCh()) return iter, nil } // UpsertJob is used to register a job or update a job definition func (s 
*StateStore) UpsertJob(msgType structs.MessageType, index uint64, job *structs.Job) error { txn := s.db.WriteTxnMsgT(msgType, index) defer txn.Abort() if err := s.upsertJobImpl(index, job, false, txn); err != nil { return err } return txn.Commit() } // UpsertJobTxn is used to register a job or update a job definition, like UpsertJob, // but in a transaction. Useful for when making multiple modifications atomically func (s *StateStore) UpsertJobTxn(index uint64, job *structs.Job, txn Txn) error { return s.upsertJobImpl(index, job, false, txn) } // upsertJobImpl is the implementation for registering a job or updating a job definition func (s *StateStore) upsertJobImpl(index uint64, job *structs.Job, keepVersion bool, txn *txn) error { // Assert the namespace exists if exists, err := s.namespaceExists(txn, job.Namespace); err != nil { return err } else if !exists { return fmt.Errorf("job %q is in nonexistent namespace %q", job.ID, job.Namespace) } // Check if the job already exists existing, err := txn.First("jobs", "id", job.Namespace, job.ID) var existingJob *structs.Job if err != nil { return fmt.Errorf("job lookup failed: %v", err) } // Setup the indexes correctly if existing != nil { job.CreateIndex = existing.(*structs.Job).CreateIndex job.ModifyIndex = index existingJob = existing.(*structs.Job) // Bump the version unless asked to keep it. This should only be done // when changing an internal field such as Stable. A spec change should // always come with a version bump if !keepVersion { job.JobModifyIndex = index if job.Version <= existingJob.Version { job.Version = existingJob.Version + 1 } } // Compute the job status var err error job.Status, err = s.getJobStatus(txn, job, false) if err != nil { return fmt.Errorf("setting job status for %q failed: %v", job.ID, err) } } else { job.CreateIndex = index job.ModifyIndex = index job.JobModifyIndex = index if err := s.setJobStatus(index, txn, job, false, ""); err != nil { return fmt.Errorf("setting job status for %q failed: %v", job.ID, err) } // Have to get the job again since it could have been updated updated, err := txn.First("jobs", "id", job.Namespace, job.ID) if err != nil { return fmt.Errorf("job lookup failed: %v", err) } if updated != nil { job = updated.(*structs.Job) } } if err := s.updateSummaryWithJob(index, job, txn); err != nil { return fmt.Errorf("unable to create job summary: %v", err) } if err := s.upsertJobVersion(index, job, txn); err != nil { return fmt.Errorf("unable to upsert job into job_version table: %v", err) } if err := s.updateJobScalingPolicies(index, job, txn); err != nil { return fmt.Errorf("unable to update job scaling policies: %v", err) } if err := s.updateJobRecommendations(index, txn, existingJob, job); err != nil { return fmt.Errorf("unable to update job recommendations: %v", err) } if err := s.updateJobCSIPlugins(index, job, existingJob, txn); err != nil { return fmt.Errorf("unable to update job csi plugins: %v", err) } // Insert the job if err := txn.Insert("jobs", job); err != nil { return fmt.Errorf("job insert failed: %v", err) } if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return nil } // DeleteJob is used to deregister a job func (s *StateStore) DeleteJob(index uint64, namespace, jobID string) error { txn := s.db.WriteTxn(index) defer txn.Abort() err := s.DeleteJobTxn(index, namespace, jobID, txn) if err == nil { return txn.Commit() } return err } // DeleteJobTxn is used to deregister a job, like DeleteJob, // but in 
a transaction. Useful for when making multiple modifications atomically func (s *StateStore) DeleteJobTxn(index uint64, namespace, jobID string, txn Txn) error { // Lookup the node existing, err := txn.First("jobs", "id", namespace, jobID) if err != nil { return fmt.Errorf("job lookup failed: %v", err) } if existing == nil { return fmt.Errorf("job not found") } // Check if we should update a parent job summary job := existing.(*structs.Job) if job.ParentID != "" { summaryRaw, err := txn.First("job_summary", "id", namespace, job.ParentID) if err != nil { return fmt.Errorf("unable to retrieve summary for parent job: %v", err) } // Only continue if the summary exists. It could not exist if the parent // job was removed if summaryRaw != nil { existing := summaryRaw.(*structs.JobSummary) pSummary := existing.Copy() if pSummary.Children != nil { modified := false switch job.Status { case structs.JobStatusPending: pSummary.Children.Pending-- pSummary.Children.Dead++ modified = true case structs.JobStatusRunning: pSummary.Children.Running-- pSummary.Children.Dead++ modified = true case structs.JobStatusDead: default: return fmt.Errorf("unknown old job status %q", job.Status) } if modified { // Update the modify index pSummary.ModifyIndex = index // Insert the summary if err := txn.Insert("job_summary", pSummary); err != nil { return fmt.Errorf("job summary insert failed: %v", err) } if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } } } } } // Delete the job if err := txn.Delete("jobs", existing); err != nil { return fmt.Errorf("job delete failed: %v", err) } if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } // Delete the job versions if err := s.deleteJobVersions(index, job, txn); err != nil { return err } // Cleanup plugins registered by this job, before we delete the summary err = s.deleteJobFromPlugins(index, txn, job) if err != nil { return fmt.Errorf("deleting job from plugin: %v", err) } // Delete the job summary if _, err = txn.DeleteAll("job_summary", "id", namespace, jobID); err != nil { return fmt.Errorf("deleting job summary failed: %v", err) } if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } // Delete any remaining job scaling policies if err := s.deleteJobScalingPolicies(index, job, txn); err != nil { return fmt.Errorf("deleting job scaling policies failed: %v", err) } // Delete any job recommendations if err := s.deleteRecommendationsByJob(index, txn, job); err != nil { return fmt.Errorf("deleting job recommendatons failed: %v", err) } // Delete the scaling events if _, err = txn.DeleteAll("scaling_event", "id", namespace, jobID); err != nil { return fmt.Errorf("deleting job scaling events failed: %v", err) } if err := txn.Insert("index", &IndexEntry{"scaling_event", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return nil } // deleteJobScalingPolicies deletes any scaling policies associated with the job func (s *StateStore) deleteJobScalingPolicies(index uint64, job *structs.Job, txn *txn) error { iter, err := s.ScalingPoliciesByJobTxn(nil, job.Namespace, job.ID, txn) if err != nil { return fmt.Errorf("getting job scaling policies for deletion failed: %v", err) } // Put them into a slice so there are no safety concerns while actually // performing the deletes policies := []interface{}{} for { raw := iter.Next() if raw == 
nil { break } policies = append(policies, raw) } // Do the deletes for _, p := range policies { if err := txn.Delete("scaling_policy", p); err != nil { return fmt.Errorf("deleting scaling policy failed: %v", err) } } if len(policies) > 0 { if err := txn.Insert("index", &IndexEntry{"scaling_policy", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } } return nil } // deleteJobVersions deletes all versions of the given job. func (s *StateStore) deleteJobVersions(index uint64, job *structs.Job, txn *txn) error { iter, err := txn.Get("job_version", "id_prefix", job.Namespace, job.ID) if err != nil { return err } // Put them into a slice so there are no safety concerns while actually // performing the deletes jobs := []*structs.Job{} for { raw := iter.Next() if raw == nil { break } // Ensure the ID is an exact match j := raw.(*structs.Job) if j.ID != job.ID { continue } jobs = append(jobs, j) } // Do the deletes for _, j := range jobs { if err := txn.Delete("job_version", j); err != nil { return fmt.Errorf("deleting job versions failed: %v", err) } } if err := txn.Insert("index", &IndexEntry{"job_version", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return nil } // upsertJobVersion inserts a job into its historic version table and limits the // number of job versions that are tracked. func (s *StateStore) upsertJobVersion(index uint64, job *structs.Job, txn *txn) error { // Insert the job if err := txn.Insert("job_version", job); err != nil { return fmt.Errorf("failed to insert job into job_version table: %v", err) } if err := txn.Insert("index", &IndexEntry{"job_version", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } // Get all the historic jobs for this ID all, err := s.jobVersionByID(txn, nil, job.Namespace, job.ID) if err != nil { return fmt.Errorf("failed to look up job versions for %q: %v", job.ID, err) } // If we are below the limit there is no GCing to be done if len(all) <= structs.JobTrackedVersions { return nil } // We have to delete a historic job to make room. // Find index of the highest versioned stable job stableIdx := -1 for i, j := range all { if j.Stable { stableIdx = i break } } // If the stable job is the oldest version, do a swap to bring it into the // keep set. max := structs.JobTrackedVersions if stableIdx == max { all[max-1], all[max] = all[max], all[max-1] } // Delete the job outside of the set that are being kept. d := all[max] if err := txn.Delete("job_version", d); err != nil { return fmt.Errorf("failed to delete job %v (%d) from job_version", d.ID, d.Version) } return nil } // JobByID is used to lookup a job by its ID. JobByID returns the current/latest job // version. func (s *StateStore) JobByID(ws memdb.WatchSet, namespace, id string) (*structs.Job, error) { txn := s.db.ReadTxn() return s.JobByIDTxn(ws, namespace, id, txn) } // JobByIDTxn is used to lookup a job by its ID, like JobByID. JobByID returns the job version // accessible through in the transaction func (s *StateStore) JobByIDTxn(ws memdb.WatchSet, namespace, id string, txn Txn) (*structs.Job, error) { watchCh, existing, err := txn.FirstWatch("jobs", "id", namespace, id) if err != nil { return nil, fmt.Errorf("job lookup failed: %v", err) } ws.Add(watchCh) if existing != nil { return existing.(*structs.Job), nil } return nil, nil } // JobsByIDPrefix is used to lookup a job by prefix. If querying all namespaces // the prefix will not be filtered by an index. 
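// Concretely: when namespace is structs.AllNamespacesSentinel the jobs table
// is walked in full and results are narrowed with a strings.HasPrefix filter,
// whereas a specific namespace can use the compound "id_prefix" index
// directly.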
func (s *StateStore) JobsByIDPrefix(ws memdb.WatchSet, namespace, id string) (memdb.ResultIterator, error) { if namespace == structs.AllNamespacesSentinel { return s.jobsByIDPrefixAllNamespaces(ws, id) } txn := s.db.ReadTxn() iter, err := txn.Get("jobs", "id_prefix", namespace, id) if err != nil { return nil, fmt.Errorf("job lookup failed: %v", err) } ws.Add(iter.WatchCh()) return iter, nil } func (s *StateStore) jobsByIDPrefixAllNamespaces(ws memdb.WatchSet, prefix string) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() // Walk the entire jobs table iter, err := txn.Get("jobs", "id") if err != nil { return nil, err } ws.Add(iter.WatchCh()) // Filter the iterator by ID prefix f := func(raw interface{}) bool { job, ok := raw.(*structs.Job) if !ok { return true } return !strings.HasPrefix(job.ID, prefix) } wrap := memdb.NewFilterIterator(iter, f) return wrap, nil } // JobVersionsByID returns all the tracked versions of a job. func (s *StateStore) JobVersionsByID(ws memdb.WatchSet, namespace, id string) ([]*structs.Job, error) { txn := s.db.ReadTxn() return s.jobVersionByID(txn, ws, namespace, id) } // jobVersionByID is the underlying implementation for retrieving all tracked // versions of a job and is called under an existing transaction. A watch set // can optionally be passed in to add the job histories to the watch set. func (s *StateStore) jobVersionByID(txn *txn, ws memdb.WatchSet, namespace, id string) ([]*structs.Job, error) { // Get all the historic jobs for this ID iter, err := txn.Get("job_version", "id_prefix", namespace, id) if err != nil { return nil, err } ws.Add(iter.WatchCh()) var all []*structs.Job for { raw := iter.Next() if raw == nil { break } // Ensure the ID is an exact match j := raw.(*structs.Job) if j.ID != id { continue } all = append(all, j) } // Sort in reverse order so that the highest version is first sort.Slice(all, func(i, j int) bool { return all[i].Version > all[j].Version }) return all, nil } // JobByIDAndVersion returns the job identified by its ID and Version. The // passed watchset may be nil. func (s *StateStore) JobByIDAndVersion(ws memdb.WatchSet, namespace, id string, version uint64) (*structs.Job, error) { txn := s.db.ReadTxn() return s.jobByIDAndVersionImpl(ws, namespace, id, version, txn) } // jobByIDAndVersionImpl returns the job identified by its ID and Version. The // passed watchset may be nil. 
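// A nil watch set is safe here because go-memdb's WatchSet.Add is a no-op on
// a nil set. The lookup uses the "job_version" table's "id" index, which is
// compound over namespace, job ID, and version, so at most one entry matches.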
func (s *StateStore) jobByIDAndVersionImpl(ws memdb.WatchSet, namespace, id string, version uint64, txn *txn) (*structs.Job, error) { watchCh, existing, err := txn.FirstWatch("job_version", "id", namespace, id, version) if err != nil { return nil, err } ws.Add(watchCh) if existing != nil { job := existing.(*structs.Job) return job, nil } return nil, nil } func (s *StateStore) JobVersions(ws memdb.WatchSet) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() // Walk the entire deployments table iter, err := txn.Get("job_version", "id") if err != nil { return nil, err } ws.Add(iter.WatchCh()) return iter, nil } // Jobs returns an iterator over all the jobs func (s *StateStore) Jobs(ws memdb.WatchSet) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() // Walk the entire jobs table iter, err := txn.Get("jobs", "id") if err != nil { return nil, err } ws.Add(iter.WatchCh()) return iter, nil } // JobsByNamespace returns an iterator over all the jobs for the given namespace func (s *StateStore) JobsByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() return s.jobsByNamespaceImpl(ws, namespace, txn) } // jobsByNamespaceImpl returns an iterator over all the jobs for the given namespace func (s *StateStore) jobsByNamespaceImpl(ws memdb.WatchSet, namespace string, txn *txn) (memdb.ResultIterator, error) { // Walk the entire jobs table iter, err := txn.Get("jobs", "id_prefix", namespace, "") if err != nil { return nil, err } ws.Add(iter.WatchCh()) return iter, nil } // JobsByPeriodic returns an iterator over all the periodic or non-periodic jobs. func (s *StateStore) JobsByPeriodic(ws memdb.WatchSet, periodic bool) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() iter, err := txn.Get("jobs", "periodic", periodic) if err != nil { return nil, err } ws.Add(iter.WatchCh()) return iter, nil } // JobsByScheduler returns an iterator over all the jobs with the specific // scheduler type. func (s *StateStore) JobsByScheduler(ws memdb.WatchSet, schedulerType string) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() // Return an iterator for jobs with the specific type. iter, err := txn.Get("jobs", "type", schedulerType) if err != nil { return nil, err } ws.Add(iter.WatchCh()) return iter, nil } // JobsByGC returns an iterator over all jobs eligible or ineligible for garbage // collection. func (s *StateStore) JobsByGC(ws memdb.WatchSet, gc bool) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() iter, err := txn.Get("jobs", "gc", gc) if err != nil { return nil, err } ws.Add(iter.WatchCh()) return iter, nil } // JobSummaryByID returns a job summary object which matches a specific id. 
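//
// An illustrative usage sketch (not part of upstream Nomad; the helper name
// is a placeholder):
//
//	func queuedAllocs(s *StateStore, namespace, jobID string) (int, error) {
//		summary, err := s.JobSummaryByID(memdb.NewWatchSet(), namespace, jobID)
//		if err != nil || summary == nil {
//			return 0, err
//		}
//		queued := 0
//		for _, tg := range summary.Summary {
//			queued += tg.Queued
//		}
//		return queued, nil
//	}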
func (s *StateStore) JobSummaryByID(ws memdb.WatchSet, namespace, jobID string) (*structs.JobSummary, error) { txn := s.db.ReadTxn() watchCh, existing, err := txn.FirstWatch("job_summary", "id", namespace, jobID) if err != nil { return nil, err } ws.Add(watchCh) if existing != nil { summary := existing.(*structs.JobSummary) return summary, nil } return nil, nil } // JobSummaries walks the entire job summary table and returns all the job // summary objects func (s *StateStore) JobSummaries(ws memdb.WatchSet) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() iter, err := txn.Get("job_summary", "id") if err != nil { return nil, err } ws.Add(iter.WatchCh()) return iter, nil } // JobSummaryByPrefix is used to look up Job Summary by id prefix func (s *StateStore) JobSummaryByPrefix(ws memdb.WatchSet, namespace, id string) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() iter, err := txn.Get("job_summary", "id_prefix", namespace, id) if err != nil { return nil, fmt.Errorf("job_summary lookup failed: %v", err) } ws.Add(iter.WatchCh()) return iter, nil } // UpsertCSIVolume inserts a volume in the state store. func (s *StateStore) UpsertCSIVolume(index uint64, volumes []*structs.CSIVolume) error { txn := s.db.WriteTxn(index) defer txn.Abort() for _, v := range volumes { if exists, err := s.namespaceExists(txn, v.Namespace); err != nil { return err } else if !exists { return fmt.Errorf("volume %s is in nonexistent namespace %s", v.ID, v.Namespace) } obj, err := txn.First("csi_volumes", "id", v.Namespace, v.ID) if err != nil { return fmt.Errorf("volume existence check error: %v", err) } if obj != nil { // Allow some properties of a volume to be updated in place, but // prevent accidentally overwriting important properties, or // overwriting a volume in use old := obj.(*structs.CSIVolume) if old.ExternalID != v.ExternalID || old.PluginID != v.PluginID || old.Provider != v.Provider { return fmt.Errorf("volume identity cannot be updated: %s", v.ID) } s.CSIVolumeDenormalize(nil, old.Copy()) if old.InUse() { return fmt.Errorf("volume cannot be updated while in use") } v.CreateIndex = old.CreateIndex v.ModifyIndex = index } else { v.CreateIndex = index v.ModifyIndex = index } // Allocations are copy on write, so we want to keep the Allocation ID // but we need to clear the pointer so that we don't store it when we // write the volume to the state store. We'll get it from the db in // denormalize. for allocID := range v.ReadAllocs { v.ReadAllocs[allocID] = nil } for allocID := range v.WriteAllocs { v.WriteAllocs[allocID] = nil } err = txn.Insert("csi_volumes", v) if err != nil { return fmt.Errorf("volume insert: %v", err) } } if err := txn.Insert("index", &IndexEntry{"csi_volumes", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return txn.Commit() } // CSIVolumes returns the unfiltered list of all volumes. Caller should // snapshot if it wants to also denormalize the plugins. func (s *StateStore) CSIVolumes(ws memdb.WatchSet) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() defer txn.Abort() iter, err := txn.Get("csi_volumes", "id") if err != nil { return nil, fmt.Errorf("csi_volumes lookup failed: %v", err) } ws.Add(iter.WatchCh()) return iter, nil } // CSIVolumeByID is used to lookup a single volume. Returns a copy of the // volume because its plugins and allocations are denormalized to provide // accurate Health. 
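//
// An illustrative usage sketch (not part of upstream Nomad; the helper name
// is a placeholder). Because the returned copy already has plugin health
// denormalized, Schedulable can be inspected directly:
//
//	func volumeSchedulable(s *StateStore, namespace, volID string) (bool, error) {
//		vol, err := s.CSIVolumeByID(memdb.NewWatchSet(), namespace, volID)
//		if err != nil || vol == nil {
//			return false, err
//		}
//		return vol.Schedulable, nil
//	}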
func (s *StateStore) CSIVolumeByID(ws memdb.WatchSet, namespace, id string) (*structs.CSIVolume, error) { txn := s.db.ReadTxn() watchCh, obj, err := txn.FirstWatch("csi_volumes", "id", namespace, id) if err != nil { return nil, fmt.Errorf("volume lookup failed for %s: %v", id, err) } ws.Add(watchCh) if obj == nil { return nil, nil } vol := obj.(*structs.CSIVolume) // we return the volume with the plugins denormalized by default, // because the scheduler needs them for feasibility checking return s.csiVolumeDenormalizePluginsTxn(txn, vol.Copy()) } // CSIVolumesByPluginID looks up csi_volumes by pluginID. Caller should // snapshot if it wants to also denormalize the plugins. func (s *StateStore) CSIVolumesByPluginID(ws memdb.WatchSet, namespace, prefix, pluginID string) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() iter, err := txn.Get("csi_volumes", "plugin_id", pluginID) if err != nil { return nil, fmt.Errorf("volume lookup failed: %v", err) } // Filter the iterator by namespace f := func(raw interface{}) bool { v, ok := raw.(*structs.CSIVolume) if !ok { return false } return v.Namespace != namespace && strings.HasPrefix(v.ID, prefix) } wrap := memdb.NewFilterIterator(iter, f) return wrap, nil } // CSIVolumesByIDPrefix supports search. Caller should snapshot if it wants to // also denormalize the plugins. If using a prefix with the wildcard namespace, // the results will not use the index prefix. func (s *StateStore) CSIVolumesByIDPrefix(ws memdb.WatchSet, namespace, volumeID string) (memdb.ResultIterator, error) { if namespace == structs.AllNamespacesSentinel { return s.csiVolumeByIDPrefixAllNamespaces(ws, volumeID) } txn := s.db.ReadTxn() iter, err := txn.Get("csi_volumes", "id_prefix", namespace, volumeID) if err != nil { return nil, err } ws.Add(iter.WatchCh()) return iter, nil } func (s *StateStore) csiVolumeByIDPrefixAllNamespaces(ws memdb.WatchSet, prefix string) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() // Walk the entire csi_volumes table iter, err := txn.Get("csi_volumes", "id") if err != nil { return nil, err } ws.Add(iter.WatchCh()) // Filter the iterator by ID prefix f := func(raw interface{}) bool { v, ok := raw.(*structs.CSIVolume) if !ok { return false } return !strings.HasPrefix(v.ID, prefix) } wrap := memdb.NewFilterIterator(iter, f) return wrap, nil } // CSIVolumesByNodeID looks up CSIVolumes in use on a node. Caller should // snapshot if it wants to also denormalize the plugins. 
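//
// An illustrative usage sketch (not part of upstream Nomad; the helper name
// is a placeholder, and the empty prefix matches every volume ID):
//
//	func volumeIDsOnNode(s *StateStore, nodeID string) ([]string, error) {
//		iter, err := s.CSIVolumesByNodeID(memdb.NewWatchSet(), "", nodeID)
//		if err != nil {
//			return nil, err
//		}
//		var ids []string
//		for raw := iter.Next(); raw != nil; raw = iter.Next() {
//			ids = append(ids, raw.(*structs.CSIVolume).ID)
//		}
//		return ids, nil
//	}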
func (s *StateStore) CSIVolumesByNodeID(ws memdb.WatchSet, prefix, nodeID string) (memdb.ResultIterator, error) { allocs, err := s.AllocsByNode(ws, nodeID) if err != nil { return nil, fmt.Errorf("alloc lookup failed: %v", err) } // Find volume ids for CSI volumes in running allocs, or allocs that we desire to run ids := map[string]string{} // Map volumeID to Namespace for _, a := range allocs { tg := a.Job.LookupTaskGroup(a.TaskGroup) if !(a.DesiredStatus == structs.AllocDesiredStatusRun || a.ClientStatus == structs.AllocClientStatusRunning) || len(tg.Volumes) == 0 { continue } for _, v := range tg.Volumes { if v.Type != structs.VolumeTypeCSI { continue } ids[v.Source] = a.Namespace } } // Lookup the raw CSIVolumes to match the other list interfaces iter := NewSliceIterator() txn := s.db.ReadTxn() for id, namespace := range ids { if strings.HasPrefix(id, prefix) { watchCh, raw, err := txn.FirstWatch("csi_volumes", "id", namespace, id) if err != nil { return nil, fmt.Errorf("volume lookup failed: %s %v", id, err) } ws.Add(watchCh) iter.Add(raw) } } return iter, nil } // CSIVolumesByNamespace looks up the entire csi_volumes table func (s *StateStore) CSIVolumesByNamespace(ws memdb.WatchSet, namespace, prefix string) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() return s.csiVolumesByNamespaceImpl(txn, ws, namespace, prefix) } func (s *StateStore) csiVolumesByNamespaceImpl(txn *txn, ws memdb.WatchSet, namespace, prefix string) (memdb.ResultIterator, error) { iter, err := txn.Get("csi_volumes", "id_prefix", namespace, prefix) if err != nil { return nil, fmt.Errorf("volume lookup failed: %v", err) } ws.Add(iter.WatchCh()) return iter, nil } // CSIVolumeClaim updates the volume's claim count and allocation list func (s *StateStore) CSIVolumeClaim(index uint64, namespace, id string, claim *structs.CSIVolumeClaim) error { txn := s.db.WriteTxn(index) defer txn.Abort() row, err := txn.First("csi_volumes", "id", namespace, id) if err != nil { return fmt.Errorf("volume lookup failed: %s: %v", id, err) } if row == nil { return fmt.Errorf("volume not found: %s", id) } orig, ok := row.(*structs.CSIVolume) if !ok { return fmt.Errorf("volume row conversion error") } var alloc *structs.Allocation if claim.State == structs.CSIVolumeClaimStateTaken { alloc, err = s.allocByIDImpl(txn, nil, claim.AllocationID) if err != nil { s.logger.Error("AllocByID failed", "error", err) return fmt.Errorf(structs.ErrUnknownAllocationPrefix) } if alloc == nil { s.logger.Error("AllocByID failed to find alloc", "alloc_id", claim.AllocationID) if err != nil { return fmt.Errorf(structs.ErrUnknownAllocationPrefix) } } } volume, err := s.csiVolumeDenormalizePluginsTxn(txn, orig.Copy()) if err != nil { return err } volume, err = s.csiVolumeDenormalizeTxn(txn, nil, volume) if err != nil { return err } // in the case of a job deregistration, there will be no allocation ID // for the claim but we still want to write an updated index to the volume // so that volume reaping is triggered if claim.AllocationID != "" { err = volume.Claim(claim, alloc) if err != nil { return err } } volume.ModifyIndex = index // Allocations are copy on write, so we want to keep the Allocation ID // but we need to clear the pointer so that we don't store it when we // write the volume to the state store. We'll get it from the db in // denormalize. 
for allocID := range volume.ReadAllocs { volume.ReadAllocs[allocID] = nil } for allocID := range volume.WriteAllocs { volume.WriteAllocs[allocID] = nil } if err = txn.Insert("csi_volumes", volume); err != nil { return fmt.Errorf("volume update failed: %s: %v", id, err) } if err = txn.Insert("index", &IndexEntry{"csi_volumes", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return txn.Commit() } // CSIVolumeDeregister removes the volume from the server func (s *StateStore) CSIVolumeDeregister(index uint64, namespace string, ids []string, force bool) error { txn := s.db.WriteTxn(index) defer txn.Abort() for _, id := range ids { existing, err := txn.First("csi_volumes", "id", namespace, id) if err != nil { return fmt.Errorf("volume lookup failed: %s: %v", id, err) } if existing == nil { return fmt.Errorf("volume not found: %s", id) } vol, ok := existing.(*structs.CSIVolume) if !ok { return fmt.Errorf("volume row conversion error: %s", id) } // The common case for a volume deregister is when the volume is // unused, but we can also let an operator intervene in the case where // allocations have been stopped but claims can't be freed because // ex. the plugins have all been removed. if vol.InUse() { if !force || !s.volSafeToForce(txn, vol) { return fmt.Errorf("volume in use: %s", id) } } if err = txn.Delete("csi_volumes", existing); err != nil { return fmt.Errorf("volume delete failed: %s: %v", id, err) } } if err := txn.Insert("index", &IndexEntry{"csi_volumes", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return txn.Commit() } // volSafeToForce checks if the any of the remaining allocations // are in a non-terminal state. func (s *StateStore) volSafeToForce(txn Txn, v *structs.CSIVolume) bool { vol, err := s.csiVolumeDenormalizeTxn(txn, nil, v) if err != nil { return false } for _, alloc := range vol.ReadAllocs { if alloc != nil && !alloc.TerminalStatus() { return false } } for _, alloc := range vol.WriteAllocs { if alloc != nil && !alloc.TerminalStatus() { return false } } return true } // CSIVolumeDenormalizePlugins returns a CSIVolume with current health and // plugins, but without allocations. // Use this for current volume metadata, handling lists of volumes. // Use CSIVolumeDenormalize for volumes containing both health and current // allocations. func (s *StateStore) CSIVolumeDenormalizePlugins(ws memdb.WatchSet, vol *structs.CSIVolume) (*structs.CSIVolume, error) { if vol == nil { return nil, nil } txn := s.db.ReadTxn() defer txn.Abort() return s.csiVolumeDenormalizePluginsTxn(txn, vol) } // csiVolumeDenormalizePluginsTxn implements // CSIVolumeDenormalizePlugins, inside a transaction. 
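//
// An illustrative sketch of the public wrapper applied to a list of volumes
// (not part of upstream Nomad; the helper name is a placeholder). Each volume
// is copied before being denormalized so the stored object is not mutated:
//
//	func denormalizePluginsForList(s *StateStore, vols []*structs.CSIVolume) ([]*structs.CSIVolume, error) {
//		out := make([]*structs.CSIVolume, 0, len(vols))
//		for _, v := range vols {
//			dv, err := s.CSIVolumeDenormalizePlugins(memdb.NewWatchSet(), v.Copy())
//			if err != nil {
//				return nil, err
//			}
//			out = append(out, dv)
//		}
//		return out, nil
//	}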
func (s *StateStore) csiVolumeDenormalizePluginsTxn(txn Txn, vol *structs.CSIVolume) (*structs.CSIVolume, error) { if vol == nil { return nil, nil } plug, err := s.CSIPluginByIDTxn(txn, nil, vol.PluginID) if err != nil { return nil, fmt.Errorf("plugin lookup error: %s %v", vol.PluginID, err) } if plug == nil { vol.ControllersHealthy = 0 vol.NodesHealthy = 0 vol.Schedulable = false return vol, nil } vol.Provider = plug.Provider vol.ProviderVersion = plug.Version vol.ControllerRequired = plug.ControllerRequired vol.ControllersHealthy = plug.ControllersHealthy vol.NodesHealthy = plug.NodesHealthy // This value may be stale, but stale is ok vol.ControllersExpected = plug.ControllersExpected vol.NodesExpected = plug.NodesExpected vol.Schedulable = vol.NodesHealthy > 0 if vol.ControllerRequired { vol.Schedulable = vol.ControllersHealthy > 0 && vol.Schedulable } return vol, nil } // CSIVolumeDenormalize returns a CSIVolume with its current // Allocations and Claims, including creating new PastClaims for // terminal or garbage collected allocations. This ensures we have a // consistent state. Note that it mutates the original volume and so // should always be called on a Copy after reading from the state // store. func (s *StateStore) CSIVolumeDenormalize(ws memdb.WatchSet, vol *structs.CSIVolume) (*structs.CSIVolume, error) { txn := s.db.ReadTxn() return s.csiVolumeDenormalizeTxn(txn, ws, vol) } // csiVolumeDenormalizeTxn implements CSIVolumeDenormalize inside a transaction func (s *StateStore) csiVolumeDenormalizeTxn(txn Txn, ws memdb.WatchSet, vol *structs.CSIVolume) (*structs.CSIVolume, error) { if vol == nil { return nil, nil } // note: denormalize mutates the maps we pass in! denormalize := func( currentAllocs map[string]*structs.Allocation, currentClaims, pastClaims map[string]*structs.CSIVolumeClaim, fallbackMode structs.CSIVolumeClaimMode) error { for id := range currentAllocs { a, err := s.allocByIDImpl(txn, ws, id) if err != nil { return err } pastClaim := pastClaims[id] currentClaim := currentClaims[id] if currentClaim == nil { // COMPAT(1.4.0): the CSIVolumeClaim fields were added // after 0.11.1, so claims made before that may be // missing this value. No clusters should see this // anymore, so warn nosily in the logs so that // operators ask us about it. Remove this block and // the now-unused fallbackMode parameter, and return // an error if currentClaim is nil in 1.4.0 s.logger.Warn("volume was missing claim for allocation", "volume_id", vol.ID, "alloc", id) currentClaim = &structs.CSIVolumeClaim{ AllocationID: a.ID, NodeID: a.NodeID, Mode: fallbackMode, State: structs.CSIVolumeClaimStateTaken, } currentClaims[id] = currentClaim } currentAllocs[id] = a if (a == nil || a.TerminalStatus()) && pastClaim == nil { // the alloc is garbage collected but nothing has written a PastClaim, // so create one now pastClaim = &structs.CSIVolumeClaim{ AllocationID: id, NodeID: currentClaim.NodeID, Mode: currentClaim.Mode, State: structs.CSIVolumeClaimStateUnpublishing, AccessMode: currentClaim.AccessMode, AttachmentMode: currentClaim.AttachmentMode, } pastClaims[id] = pastClaim } } return nil } err := denormalize(vol.ReadAllocs, vol.ReadClaims, vol.PastClaims, structs.CSIVolumeClaimRead) if err != nil { return nil, err } err = denormalize(vol.WriteAllocs, vol.WriteClaims, vol.PastClaims, structs.CSIVolumeClaimWrite) if err != nil { return nil, err } // COMPAT: the AccessMode and AttachmentMode fields were added to claims // in 1.1.0, so claims made before that may be missing this value. 
In this // case, the volume will already have AccessMode/AttachmentMode until it // no longer has any claims, so set from those values for _, claim := range vol.ReadClaims { if claim.AccessMode == "" || claim.AttachmentMode == "" { claim.AccessMode = vol.AccessMode claim.AttachmentMode = vol.AttachmentMode } } for _, claim := range vol.WriteClaims { if claim.AccessMode == "" || claim.AttachmentMode == "" { claim.AccessMode = vol.AccessMode claim.AttachmentMode = vol.AttachmentMode } } return vol, nil } // CSIPlugins returns the unfiltered list of all plugin health status func (s *StateStore) CSIPlugins(ws memdb.WatchSet) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() defer txn.Abort() iter, err := txn.Get("csi_plugins", "id") if err != nil { return nil, fmt.Errorf("csi_plugins lookup failed: %v", err) } ws.Add(iter.WatchCh()) return iter, nil } // CSIPluginsByIDPrefix supports search func (s *StateStore) CSIPluginsByIDPrefix(ws memdb.WatchSet, pluginID string) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() iter, err := txn.Get("csi_plugins", "id_prefix", pluginID) if err != nil { return nil, err } ws.Add(iter.WatchCh()) return iter, nil } // CSIPluginByID returns a named CSIPlugin. This method creates a new // transaction so you should not call it from within another transaction. func (s *StateStore) CSIPluginByID(ws memdb.WatchSet, id string) (*structs.CSIPlugin, error) { txn := s.db.ReadTxn() plugin, err := s.CSIPluginByIDTxn(txn, ws, id) if err != nil { return nil, err } return plugin, nil } // CSIPluginByIDTxn returns a named CSIPlugin func (s *StateStore) CSIPluginByIDTxn(txn Txn, ws memdb.WatchSet, id string) (*structs.CSIPlugin, error) { watchCh, obj, err := txn.FirstWatch("csi_plugins", "id", id) if err != nil { return nil, fmt.Errorf("csi_plugin lookup failed: %s %v", id, err) } ws.Add(watchCh) if obj != nil { return obj.(*structs.CSIPlugin), nil } return nil, nil } // CSIPluginDenormalize returns a CSIPlugin with allocation details. Always called on a copy of the plugin. func (s *StateStore) CSIPluginDenormalize(ws memdb.WatchSet, plug *structs.CSIPlugin) (*structs.CSIPlugin, error) { txn := s.db.ReadTxn() return s.CSIPluginDenormalizeTxn(txn, ws, plug) } func (s *StateStore) CSIPluginDenormalizeTxn(txn Txn, ws memdb.WatchSet, plug *structs.CSIPlugin) (*structs.CSIPlugin, error) { if plug == nil { return nil, nil } // Get the unique list of allocation ids ids := map[string]struct{}{} for _, info := range plug.Controllers { ids[info.AllocID] = struct{}{} } for _, info := range plug.Nodes { ids[info.AllocID] = struct{}{} } for id := range ids { alloc, err := s.allocByIDImpl(txn, ws, id) if err != nil { return nil, err } if alloc == nil { continue } plug.Allocations = append(plug.Allocations, alloc.Stub(nil)) } sort.Slice(plug.Allocations, func(i, j int) bool { return plug.Allocations[i].ModifyIndex > plug.Allocations[j].ModifyIndex }) return plug, nil } // UpsertCSIPlugin writes the plugin to the state store. Note: there // is currently no raft message for this, as it's intended to support // testing use cases. 
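//
// An illustrative test-style sketch (not part of upstream Nomad; the plugin
// ID, provider, and helper name are placeholders):
//
//	func upsertTestPlugin(s *StateStore, index uint64) error {
//		plug := &structs.CSIPlugin{
//			ID:       "example-plugin",
//			Provider: "csi.example.com",
//		}
//		return s.UpsertCSIPlugin(index, plug)
//	}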
func (s *StateStore) UpsertCSIPlugin(index uint64, plug *structs.CSIPlugin) error { txn := s.db.WriteTxn(index) defer txn.Abort() existing, err := txn.First("csi_plugins", "id", plug.ID) if err != nil { return fmt.Errorf("csi_plugin lookup error: %s %v", plug.ID, err) } plug.ModifyIndex = index if existing != nil { plug.CreateIndex = existing.(*structs.CSIPlugin).CreateIndex } err = txn.Insert("csi_plugins", plug) if err != nil { return fmt.Errorf("csi_plugins insert error: %v", err) } if err := txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return txn.Commit() } // DeleteCSIPlugin deletes the plugin if it's not in use. func (s *StateStore) DeleteCSIPlugin(index uint64, id string) error { txn := s.db.WriteTxn(index) defer txn.Abort() plug, err := s.CSIPluginByIDTxn(txn, nil, id) if err != nil { return err } if plug == nil { return nil } plug, err = s.CSIPluginDenormalizeTxn(txn, nil, plug.Copy()) if err != nil { return err } if !plug.IsEmpty() { return fmt.Errorf("plugin in use") } err = txn.Delete("csi_plugins", plug) if err != nil { return fmt.Errorf("csi_plugins delete error: %v", err) } return txn.Commit() } // UpsertPeriodicLaunch is used to register a launch or update it. func (s *StateStore) UpsertPeriodicLaunch(index uint64, launch *structs.PeriodicLaunch) error { txn := s.db.WriteTxn(index) defer txn.Abort() // Check if the job already exists existing, err := txn.First("periodic_launch", "id", launch.Namespace, launch.ID) if err != nil { return fmt.Errorf("periodic launch lookup failed: %v", err) } // Setup the indexes correctly if existing != nil { launch.CreateIndex = existing.(*structs.PeriodicLaunch).CreateIndex launch.ModifyIndex = index } else { launch.CreateIndex = index launch.ModifyIndex = index } // Insert the job if err := txn.Insert("periodic_launch", launch); err != nil { return fmt.Errorf("launch insert failed: %v", err) } if err := txn.Insert("index", &IndexEntry{"periodic_launch", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return txn.Commit() } // DeletePeriodicLaunch is used to delete the periodic launch func (s *StateStore) DeletePeriodicLaunch(index uint64, namespace, jobID string) error { txn := s.db.WriteTxn(index) defer txn.Abort() err := s.DeletePeriodicLaunchTxn(index, namespace, jobID, txn) if err == nil { return txn.Commit() } return err } // DeletePeriodicLaunchTxn is used to delete the periodic launch, like DeletePeriodicLaunch // but in a transaction. Useful for when making multiple modifications atomically func (s *StateStore) DeletePeriodicLaunchTxn(index uint64, namespace, jobID string, txn Txn) error { // Lookup the launch existing, err := txn.First("periodic_launch", "id", namespace, jobID) if err != nil { return fmt.Errorf("launch lookup failed: %v", err) } if existing == nil { return fmt.Errorf("launch not found") } // Delete the launch if err := txn.Delete("periodic_launch", existing); err != nil { return fmt.Errorf("launch delete failed: %v", err) } if err := txn.Insert("index", &IndexEntry{"periodic_launch", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return nil } // PeriodicLaunchByID is used to lookup a periodic launch by the periodic job // ID. 
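//
// An illustrative usage sketch (not part of upstream Nomad; the helper name
// is a placeholder):
//
//	func lastLaunch(s *StateStore, namespace, jobID string) (time.Time, error) {
//		launch, err := s.PeriodicLaunchByID(memdb.NewWatchSet(), namespace, jobID)
//		if err != nil || launch == nil {
//			return time.Time{}, err
//		}
//		return launch.Launch, nil
//	}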
func (s *StateStore) PeriodicLaunchByID(ws memdb.WatchSet, namespace, id string) (*structs.PeriodicLaunch, error) { txn := s.db.ReadTxn() watchCh, existing, err := txn.FirstWatch("periodic_launch", "id", namespace, id) if err != nil { return nil, fmt.Errorf("periodic launch lookup failed: %v", err) } ws.Add(watchCh) if existing != nil { return existing.(*structs.PeriodicLaunch), nil } return nil, nil } // PeriodicLaunches returns an iterator over all the periodic launches func (s *StateStore) PeriodicLaunches(ws memdb.WatchSet) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() // Walk the entire table iter, err := txn.Get("periodic_launch", "id") if err != nil { return nil, err } ws.Add(iter.WatchCh()) return iter, nil } // UpsertEvals is used to upsert a set of evaluations func (s *StateStore) UpsertEvals(msgType structs.MessageType, index uint64, evals []*structs.Evaluation) error { txn := s.db.WriteTxnMsgT(msgType, index) defer txn.Abort() err := s.UpsertEvalsTxn(index, evals, txn) if err == nil { return txn.Commit() } return err } // UpsertEvalsTxn is used to upsert a set of evaluations, like UpsertEvals but // in a transaction. Useful for when making multiple modifications atomically. func (s *StateStore) UpsertEvalsTxn(index uint64, evals []*structs.Evaluation, txn Txn) error { // Do a nested upsert jobs := make(map[structs.NamespacedID]string, len(evals)) for _, eval := range evals { if err := s.nestedUpsertEval(txn, index, eval); err != nil { return err } tuple := structs.NamespacedID{ ID: eval.JobID, Namespace: eval.Namespace, } jobs[tuple] = "" } // Set the job's status if err := s.setJobStatuses(index, txn, jobs, false); err != nil { return fmt.Errorf("setting job status failed: %v", err) } return nil } // nestedUpsertEvaluation is used to nest an evaluation upsert within a transaction func (s *StateStore) nestedUpsertEval(txn *txn, index uint64, eval *structs.Evaluation) error { // Lookup the evaluation existing, err := txn.First("evals", "id", eval.ID) if err != nil { return fmt.Errorf("eval lookup failed: %v", err) } // Update the indexes if existing != nil { eval.CreateIndex = existing.(*structs.Evaluation).CreateIndex eval.ModifyIndex = index } else { eval.CreateIndex = index eval.ModifyIndex = index } // Update the job summary summaryRaw, err := txn.First("job_summary", "id", eval.Namespace, eval.JobID) if err != nil { return fmt.Errorf("job summary lookup failed: %v", err) } if summaryRaw != nil { js := summaryRaw.(*structs.JobSummary).Copy() hasSummaryChanged := false for tg, num := range eval.QueuedAllocations { if summary, ok := js.Summary[tg]; ok { if summary.Queued != num { summary.Queued = num js.Summary[tg] = summary hasSummaryChanged = true } } else { s.logger.Error("unable to update queued for job and task group", "job_id", eval.JobID, "task_group", tg, "namespace", eval.Namespace) } } // Insert the job summary if hasSummaryChanged { js.ModifyIndex = index if err := txn.Insert("job_summary", js); err != nil { return fmt.Errorf("job summary insert failed: %v", err) } if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } } } // Check if the job has any blocked evaluations and cancel them if eval.Status == structs.EvalStatusComplete && len(eval.FailedTGAllocs) == 0 { // Get the blocked evaluation for a job if it exists iter, err := txn.Get("evals", "job", eval.Namespace, eval.JobID, structs.EvalStatusBlocked) if err != nil { return fmt.Errorf("failed to get blocked evals for job %q in 
namespace %q: %v", eval.JobID, eval.Namespace, err) } var blocked []*structs.Evaluation for { raw := iter.Next() if raw == nil { break } blocked = append(blocked, raw.(*structs.Evaluation)) } // Go through and update the evals for _, eval := range blocked { newEval := eval.Copy() newEval.Status = structs.EvalStatusCancelled newEval.StatusDescription = fmt.Sprintf("evaluation %q successful", newEval.ID) newEval.ModifyIndex = index if err := txn.Insert("evals", newEval); err != nil { return fmt.Errorf("eval insert failed: %v", err) } } } // Insert the eval if err := txn.Insert("evals", eval); err != nil { return fmt.Errorf("eval insert failed: %v", err) } if err := txn.Insert("index", &IndexEntry{"evals", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return nil } // updateEvalModifyIndex is used to update the modify index of an evaluation that has been // through a scheduler pass. This is done as part of plan apply. It ensures that when a subsequent // scheduler workers process a re-queued evaluation it sees any partial updates from the plan apply. func (s *StateStore) updateEvalModifyIndex(txn *txn, index uint64, evalID string) error { // Lookup the evaluation existing, err := txn.First("evals", "id", evalID) if err != nil { return fmt.Errorf("eval lookup failed: %v", err) } if existing == nil { s.logger.Error("unable to find eval", "eval_id", evalID) return fmt.Errorf("unable to find eval id %q", evalID) } eval := existing.(*structs.Evaluation).Copy() // Update the indexes eval.ModifyIndex = index // Insert the eval if err := txn.Insert("evals", eval); err != nil { return fmt.Errorf("eval insert failed: %v", err) } if err := txn.Insert("index", &IndexEntry{"evals", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return nil } // EvalIsUserDeleteSafe ensures an evaluation is safe to delete based on its // related allocation and job information. This follows similar, but different // rules to the eval reap checking, to ensure evaluations for running allocs or // allocs which need the evaluation detail are not deleted. // // Returns both a bool and an error so that error in querying the related // objects can be differentiated from reporting that the eval isn't safe to // delete. func (s *StateStore) EvalIsUserDeleteSafe(ws memdb.WatchSet, eval *structs.Evaluation) (bool, error) { job, err := s.JobByID(ws, eval.Namespace, eval.JobID) if err != nil { return false, fmt.Errorf("failed to lookup job for eval: %v", err) } allocs, err := s.AllocsByEval(ws, eval.ID) if err != nil { return false, fmt.Errorf("failed to lookup eval allocs: %v", err) } return isEvalDeleteSafe(allocs, job), nil } func isEvalDeleteSafe(allocs []*structs.Allocation, job *structs.Job) bool { // If the job is deleted, stopped, or dead, all allocs are terminal and // the eval can be deleted. if job == nil || job.Stop || job.Status == structs.JobStatusDead { return true } // Iterate the allocations associated to the eval, if any, and check // whether we can delete the eval. for _, alloc := range allocs { // If the allocation is still classed as running on the client, or // might be, we can't delete. switch alloc.ClientStatus { case structs.AllocClientStatusRunning, structs.AllocClientStatusUnknown: return false } // If the alloc hasn't failed then we don't need to consider it for // rescheduling. Rescheduling needs to copy over information from the // previous alloc so that it can enforce the reschedule policy. 
if alloc.ClientStatus != structs.AllocClientStatusFailed { continue } var reschedulePolicy *structs.ReschedulePolicy tg := job.LookupTaskGroup(alloc.TaskGroup) if tg != nil { reschedulePolicy = tg.ReschedulePolicy } // No reschedule policy or rescheduling is disabled if reschedulePolicy == nil || (!reschedulePolicy.Unlimited && reschedulePolicy.Attempts == 0) { continue } // The restart tracking information has not been carried forward. if alloc.NextAllocation == "" { return false } // This task has unlimited rescheduling and the alloc has not been // replaced, so we can't delete the eval yet. if reschedulePolicy.Unlimited { return false } // No restarts have been attempted yet. if alloc.RescheduleTracker == nil || len(alloc.RescheduleTracker.Events) == 0 { return false } } return true } // DeleteEval is used to delete an evaluation func (s *StateStore) DeleteEval(index uint64, evals, allocs []string, userInitiated bool) error { txn := s.db.WriteTxn(index) defer txn.Abort() // If this deletion has been initiated by an operator, ensure the eval // broker is paused. if userInitiated { _, schedConfig, err := s.schedulerConfigTxn(txn) if err != nil { return err } if schedConfig == nil || !schedConfig.PauseEvalBroker { return errors.New("eval broker is enabled; eval broker must be paused to delete evals") } } jobs := make(map[structs.NamespacedID]string, len(evals)) // evalsTableUpdated and allocsTableUpdated allow us to track whether each // table has been modified. This allows us to skip updating the index table // entries if we do not need to. var evalsTableUpdated, allocsTableUpdated bool for _, eval := range evals { existing, err := txn.First("evals", "id", eval) if err != nil { return fmt.Errorf("eval lookup failed: %v", err) } if existing == nil { continue } if err := txn.Delete("evals", existing); err != nil { return fmt.Errorf("eval delete failed: %v", err) } // Mark that we have made a successful modification to the evals // table. evalsTableUpdated = true eval := existing.(*structs.Evaluation) tuple := structs.NamespacedID{ ID: eval.JobID, Namespace: eval.Namespace, } jobs[tuple] = "" } for _, alloc := range allocs { raw, err := txn.First("allocs", "id", alloc) if err != nil { return fmt.Errorf("alloc lookup failed: %v", err) } if raw == nil { continue } if err := txn.Delete("allocs", raw); err != nil { return fmt.Errorf("alloc delete failed: %v", err) } // Mark that we have made a successful modification to the allocs // table. allocsTableUpdated = true } // Update the indexes if evalsTableUpdated { if err := txn.Insert("index", &IndexEntry{"evals", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } } if allocsTableUpdated { if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } } // Set the job's status if err := s.setJobStatuses(index, txn, jobs, true); err != nil { return fmt.Errorf("setting job status failed: %v", err) } return txn.Commit() } // EvalByID is used to lookup an eval by its ID func (s *StateStore) EvalByID(ws memdb.WatchSet, id string) (*structs.Evaluation, error) { txn := s.db.ReadTxn() watchCh, existing, err := txn.FirstWatch("evals", "id", id) if err != nil { return nil, fmt.Errorf("eval lookup failed: %v", err) } ws.Add(watchCh) if existing != nil { return existing.(*structs.Evaluation), nil } return nil, nil } // EvalsRelatedToID is used to retrieve the evals that are related (next, // previous, or blocked) to the provided eval ID. 
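//
// An illustrative usage sketch (not part of upstream Nomad; the helper name
// is a placeholder):
//
//	func relatedEvalIDs(s *StateStore, evalID string) ([]string, error) {
//		stubs, err := s.EvalsRelatedToID(memdb.NewWatchSet(), evalID)
//		if err != nil {
//			return nil, err
//		}
//		ids := make([]string, 0, len(stubs))
//		for _, stub := range stubs {
//			ids = append(ids, stub.ID)
//		}
//		return ids, nil
//	}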
func (s *StateStore) EvalsRelatedToID(ws memdb.WatchSet, id string) ([]*structs.EvaluationStub, error) { txn := s.db.ReadTxn() raw, err := txn.First("evals", "id", id) if err != nil { return nil, fmt.Errorf("eval lookup failed: %v", err) } if raw == nil { return nil, nil } eval := raw.(*structs.Evaluation) relatedEvals := []*structs.EvaluationStub{} todo := eval.RelatedIDs() done := map[string]bool{ eval.ID: true, // don't place the requested eval in the related list. } for len(todo) > 0 { // Pop the first value from the todo list. current := todo[0] todo = todo[1:] if current == "" { continue } // Skip value if we already have it in the results. if done[current] { continue } eval, err := s.EvalByID(ws, current) if err != nil { return nil, err } if eval == nil { continue } todo = append(todo, eval.RelatedIDs()...) relatedEvals = append(relatedEvals, eval.Stub()) done[eval.ID] = true } return relatedEvals, nil } // EvalsByIDPrefix is used to lookup evaluations by prefix in a particular // namespace func (s *StateStore) EvalsByIDPrefix(ws memdb.WatchSet, namespace, id string, sort SortOption) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() var iter memdb.ResultIterator var err error // Get an iterator over all evals by the id prefix switch sort { case SortReverse: iter, err = txn.GetReverse("evals", "id_prefix", id) default: iter, err = txn.Get("evals", "id_prefix", id) } if err != nil { return nil, fmt.Errorf("eval lookup failed: %v", err) } ws.Add(iter.WatchCh()) // Wrap the iterator in a filter wrap := memdb.NewFilterIterator(iter, evalNamespaceFilter(namespace)) return wrap, nil } // evalNamespaceFilter returns a filter function that filters all evaluations // not in the given namespace. func evalNamespaceFilter(namespace string) func(interface{}) bool { return func(raw interface{}) bool { eval, ok := raw.(*structs.Evaluation) if !ok { return true } return namespace != structs.AllNamespacesSentinel && eval.Namespace != namespace } } // EvalsByJob returns all the evaluations by job id func (s *StateStore) EvalsByJob(ws memdb.WatchSet, namespace, jobID string) ([]*structs.Evaluation, error) { txn := s.db.ReadTxn() // Get an iterator over the node allocations iter, err := txn.Get("evals", "job_prefix", namespace, jobID) if err != nil { return nil, err } ws.Add(iter.WatchCh()) var out []*structs.Evaluation for { raw := iter.Next() if raw == nil { break } e := raw.(*structs.Evaluation) // Filter non-exact matches if e.JobID != jobID { continue } out = append(out, e) } return out, nil } // Evals returns an iterator over all the evaluations in ascending or descending // order of CreationIndex as determined by the reverse parameter. func (s *StateStore) Evals(ws memdb.WatchSet, sort SortOption) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() var it memdb.ResultIterator var err error switch sort { case SortReverse: it, err = txn.GetReverse("evals", "create") default: it, err = txn.Get("evals", "create") } if err != nil { return nil, err } ws.Add(it.WatchCh()) return it, nil } // EvalsByNamespace returns an iterator over all evaluations in no particular // order. // // todo(shoenig): can this be removed? 
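//
// An illustrative usage sketch (not part of upstream Nomad; the helper name
// is a placeholder):
//
//	func evalCountInNamespace(s *StateStore, namespace string) (int, error) {
//		iter, err := s.EvalsByNamespace(memdb.NewWatchSet(), namespace)
//		if err != nil {
//			return 0, err
//		}
//		n := 0
//		for raw := iter.Next(); raw != nil; raw = iter.Next() {
//			n++
//		}
//		return n, nil
//	}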
func (s *StateStore) EvalsByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() it, err := txn.Get("evals", "namespace", namespace) if err != nil { return nil, err } ws.Add(it.WatchCh()) return it, nil } func (s *StateStore) EvalsByNamespaceOrdered(ws memdb.WatchSet, namespace string, sort SortOption) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() var ( it memdb.ResultIterator err error exact = terminate(namespace) ) switch sort { case SortReverse: it, err = txn.GetReverse("evals", "namespace_create_prefix", exact) default: it, err = txn.Get("evals", "namespace_create_prefix", exact) } if err != nil { return nil, err } ws.Add(it.WatchCh()) return it, nil } // UpdateAllocsFromClient is used to update an allocation based on input // from a client. While the schedulers are the authority on the allocation for // most things, some updates are authoritative from the client. Specifically, // the desired state comes from the schedulers, while the actual state comes // from clients. func (s *StateStore) UpdateAllocsFromClient(msgType structs.MessageType, index uint64, allocs []*structs.Allocation) error { txn := s.db.WriteTxnMsgT(msgType, index) defer txn.Abort() // Handle each of the updated allocations for _, alloc := range allocs { if err := s.nestedUpdateAllocFromClient(txn, index, alloc); err != nil { return err } } // Update the indexes if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return txn.Commit() } // nestedUpdateAllocFromClient is used to nest an update of an allocation with client status func (s *StateStore) nestedUpdateAllocFromClient(txn *txn, index uint64, alloc *structs.Allocation) error { // Look for existing alloc existing, err := txn.First("allocs", "id", alloc.ID) if err != nil { return fmt.Errorf("alloc lookup failed: %v", err) } // Nothing to do if this does not exist if existing == nil { return nil } exist := existing.(*structs.Allocation) // Copy everything from the existing allocation copyAlloc := exist.Copy() // Pull in anything the client is the authority on copyAlloc.ClientStatus = alloc.ClientStatus copyAlloc.ClientDescription = alloc.ClientDescription copyAlloc.TaskStates = alloc.TaskStates copyAlloc.NetworkStatus = alloc.NetworkStatus // The client can only set its deployment health and timestamp, so just take // those if copyAlloc.DeploymentStatus != nil && alloc.DeploymentStatus != nil { oldHasHealthy := copyAlloc.DeploymentStatus.HasHealth() newHasHealthy := alloc.DeploymentStatus.HasHealth() // We got new health information from the client if newHasHealthy && (!oldHasHealthy || *copyAlloc.DeploymentStatus.Healthy != *alloc.DeploymentStatus.Healthy) { // Updated deployment health and timestamp copyAlloc.DeploymentStatus.Healthy = pointer.Of(*alloc.DeploymentStatus.Healthy) copyAlloc.DeploymentStatus.Timestamp = alloc.DeploymentStatus.Timestamp copyAlloc.DeploymentStatus.ModifyIndex = index } } else if alloc.DeploymentStatus != nil { // First time getting a deployment status so copy everything and just // set the index copyAlloc.DeploymentStatus = alloc.DeploymentStatus.Copy() copyAlloc.DeploymentStatus.ModifyIndex = index } // Update the modify index copyAlloc.ModifyIndex = index // Update the modify time copyAlloc.ModifyTime = alloc.ModifyTime if err := s.updateDeploymentWithAlloc(index, copyAlloc, exist, txn); err != nil { return fmt.Errorf("error updating deployment: %v", err) } if err := s.updateSummaryWithAlloc(index, 
copyAlloc, exist, txn); err != nil { return fmt.Errorf("error updating job summary: %v", err) } if err := s.updateEntWithAlloc(index, copyAlloc, exist, txn); err != nil { return err } if err := s.updatePluginForTerminalAlloc(index, copyAlloc, txn); err != nil { return err } // Update the allocation if err := txn.Insert("allocs", copyAlloc); err != nil { return fmt.Errorf("alloc insert failed: %v", err) } // Set the job's status forceStatus := "" if !copyAlloc.TerminalStatus() { forceStatus = structs.JobStatusRunning } tuple := structs.NamespacedID{ ID: exist.JobID, Namespace: exist.Namespace, } jobs := map[structs.NamespacedID]string{tuple: forceStatus} if err := s.setJobStatuses(index, txn, jobs, false); err != nil { return fmt.Errorf("setting job status failed: %v", err) } return nil } // UpsertAllocs is used to evict a set of allocations and allocate new ones at // the same time. func (s *StateStore) UpsertAllocs(msgType structs.MessageType, index uint64, allocs []*structs.Allocation) error { txn := s.db.WriteTxn(index) defer txn.Abort() if err := s.upsertAllocsImpl(index, allocs, txn); err != nil { return err } return txn.Commit() } // upsertAllocs is the actual implementation of UpsertAllocs so that it may be // used with an existing transaction. func (s *StateStore) upsertAllocsImpl(index uint64, allocs []*structs.Allocation, txn *txn) error { // Handle the allocations jobs := make(map[structs.NamespacedID]string, 1) for _, alloc := range allocs { existing, err := txn.First("allocs", "id", alloc.ID) if err != nil { return fmt.Errorf("alloc lookup failed: %v", err) } exist, _ := existing.(*structs.Allocation) if exist == nil { alloc.CreateIndex = index alloc.ModifyIndex = index alloc.AllocModifyIndex = index if alloc.DeploymentStatus != nil { alloc.DeploymentStatus.ModifyIndex = index } // Issue https://github.com/hashicorp/nomad/issues/2583 uncovered // the a race between a forced garbage collection and the scheduler // marking an allocation as terminal. The issue is that the // allocation from the scheduler has its job normalized and the FSM // will only denormalize if the allocation is not terminal. However // if the allocation is garbage collected, that will result in a // allocation being upserted for the first time without a job // attached. By returning an error here, it will cause the FSM to // error, causing the plan_apply to error and thus causing the // evaluation to be failed. This will force an index refresh that // should solve this issue. if alloc.Job == nil { return fmt.Errorf("attempting to upsert allocation %q without a job", alloc.ID) } } else { alloc.CreateIndex = exist.CreateIndex alloc.ModifyIndex = index alloc.AllocModifyIndex = index // Keep the clients task states alloc.TaskStates = exist.TaskStates // If the scheduler is marking this allocation as lost or unknown we do not // want to reuse the status of the existing allocation. if alloc.ClientStatus != structs.AllocClientStatusLost && alloc.ClientStatus != structs.AllocClientStatusUnknown { alloc.ClientStatus = exist.ClientStatus alloc.ClientDescription = exist.ClientDescription } // The job has been denormalized so re-attach the original job if alloc.Job == nil { alloc.Job = exist.Job } } // OPTIMIZATION: // These should be given a map of new to old allocation and the updates // should be one on all changes. 
The current implementation causes O(n) // lookups/copies/insertions rather than O(1) if err := s.updateDeploymentWithAlloc(index, alloc, exist, txn); err != nil { return fmt.Errorf("error updating deployment: %v", err) } if err := s.updateSummaryWithAlloc(index, alloc, exist, txn); err != nil { return fmt.Errorf("error updating job summary: %v", err) } if err := s.updateEntWithAlloc(index, alloc, exist, txn); err != nil { return err } if err := s.updatePluginForTerminalAlloc(index, alloc, txn); err != nil { return err } if err := txn.Insert("allocs", alloc); err != nil { return fmt.Errorf("alloc insert failed: %v", err) } if alloc.PreviousAllocation != "" { prevAlloc, err := txn.First("allocs", "id", alloc.PreviousAllocation) if err != nil { return fmt.Errorf("alloc lookup failed: %v", err) } existingPrevAlloc, _ := prevAlloc.(*structs.Allocation) if existingPrevAlloc != nil { prevAllocCopy := existingPrevAlloc.Copy() prevAllocCopy.NextAllocation = alloc.ID prevAllocCopy.ModifyIndex = index if err := txn.Insert("allocs", prevAllocCopy); err != nil { return fmt.Errorf("alloc insert failed: %v", err) } } } // If the allocation is running, force the job to running status. forceStatus := "" if !alloc.TerminalStatus() { forceStatus = structs.JobStatusRunning } tuple := structs.NamespacedID{ ID: alloc.JobID, Namespace: alloc.Namespace, } jobs[tuple] = forceStatus } // Update the indexes if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } // Set the job's status if err := s.setJobStatuses(index, txn, jobs, false); err != nil { return fmt.Errorf("setting job status failed: %v", err) } return nil } // UpdateAllocsDesiredTransitions is used to update a set of allocations // desired transitions. 
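//
// An illustrative usage sketch (not part of upstream Nomad; the helper name
// is a placeholder, and msgType/index come from the Raft log entry being
// applied):
//
//	func markAllocForMigration(s *StateStore, msgType structs.MessageType, index uint64, allocID string, eval *structs.Evaluation) error {
//		transitions := map[string]*structs.DesiredTransition{
//			allocID: {Migrate: pointer.Of(true)},
//		}
//		return s.UpdateAllocsDesiredTransitions(msgType, index, transitions, []*structs.Evaluation{eval})
//	}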
func (s *StateStore) UpdateAllocsDesiredTransitions(msgType structs.MessageType, index uint64, allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) error { txn := s.db.WriteTxnMsgT(msgType, index) defer txn.Abort() // Handle each of the updated allocations for id, transition := range allocs { if err := s.UpdateAllocDesiredTransitionTxn(txn, index, id, transition); err != nil { return err } } for _, eval := range evals { if err := s.nestedUpsertEval(txn, index, eval); err != nil { return err } } // Update the indexes if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return txn.Commit() } // UpdateAllocDesiredTransitionTxn is used to nest an update of an // allocations desired transition func (s *StateStore) UpdateAllocDesiredTransitionTxn( txn *txn, index uint64, allocID string, transition *structs.DesiredTransition) error { // Look for existing alloc existing, err := txn.First("allocs", "id", allocID) if err != nil { return fmt.Errorf("alloc lookup failed: %v", err) } // Nothing to do if this does not exist if existing == nil { return nil } exist := existing.(*structs.Allocation) // Copy everything from the existing allocation copyAlloc := exist.Copy() // Merge the desired transitions copyAlloc.DesiredTransition.Merge(transition) // Update the modify indexes copyAlloc.ModifyIndex = index copyAlloc.AllocModifyIndex = index // Update the allocation if err := txn.Insert("allocs", copyAlloc); err != nil { return fmt.Errorf("alloc insert failed: %v", err) } return nil } // AllocByID is used to lookup an allocation by its ID func (s *StateStore) AllocByID(ws memdb.WatchSet, id string) (*structs.Allocation, error) { txn := s.db.ReadTxn() return s.allocByIDImpl(txn, ws, id) } // allocByIDImpl retrives an allocation and is called under and existing // transaction. An optional watch set can be passed to add allocations to the // watch set func (s *StateStore) allocByIDImpl(txn Txn, ws memdb.WatchSet, id string) (*structs.Allocation, error) { watchCh, raw, err := txn.FirstWatch("allocs", "id", id) if err != nil { return nil, fmt.Errorf("alloc lookup failed: %v", err) } ws.Add(watchCh) if raw == nil { return nil, nil } alloc := raw.(*structs.Allocation) return alloc, nil } // AllocsByIDPrefix is used to lookup allocs by prefix func (s *StateStore) AllocsByIDPrefix(ws memdb.WatchSet, namespace, id string, sort SortOption) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() var iter memdb.ResultIterator var err error switch sort { case SortReverse: iter, err = txn.GetReverse("allocs", "id_prefix", id) default: iter, err = txn.Get("allocs", "id_prefix", id) } if err != nil { return nil, fmt.Errorf("alloc lookup failed: %v", err) } ws.Add(iter.WatchCh()) // Wrap the iterator in a filter wrap := memdb.NewFilterIterator(iter, allocNamespaceFilter(namespace)) return wrap, nil } // allocNamespaceFilter returns a filter function that filters all allocations // not in the given namespace. func allocNamespaceFilter(namespace string) func(interface{}) bool { return func(raw interface{}) bool { alloc, ok := raw.(*structs.Allocation) if !ok { return true } if namespace == structs.AllNamespacesSentinel { return false } return alloc.Namespace != namespace } } // AllocsByIDPrefixAllNSs is used to lookup allocs by prefix. 
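//
// An illustrative usage sketch (not part of upstream Nomad; the helper name
// is a placeholder):
//
//	func allocIDsWithPrefix(s *StateStore, prefix string) ([]string, error) {
//		iter, err := s.AllocsByIDPrefixAllNSs(memdb.NewWatchSet(), prefix)
//		if err != nil {
//			return nil, err
//		}
//		var ids []string
//		for raw := iter.Next(); raw != nil; raw = iter.Next() {
//			ids = append(ids, raw.(*structs.Allocation).ID)
//		}
//		return ids, nil
//	}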
func (s *StateStore) AllocsByIDPrefixAllNSs(ws memdb.WatchSet, prefix string) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() iter, err := txn.Get("allocs", "id_prefix", prefix) if err != nil { return nil, fmt.Errorf("alloc lookup failed: %v", err) } ws.Add(iter.WatchCh()) return iter, nil } // AllocsByNode returns all the allocations by node func (s *StateStore) AllocsByNode(ws memdb.WatchSet, node string) ([]*structs.Allocation, error) { txn := s.db.ReadTxn() return allocsByNodeTxn(txn, ws, node) } func allocsByNodeTxn(txn ReadTxn, ws memdb.WatchSet, node string) ([]*structs.Allocation, error) { // Get an iterator over the node allocations, using only the // node prefix which ignores the terminal status iter, err := txn.Get("allocs", "node_prefix", node) if err != nil { return nil, err } ws.Add(iter.WatchCh()) var out []*structs.Allocation for { raw := iter.Next() if raw == nil { break } out = append(out, raw.(*structs.Allocation)) } return out, nil } // AllocsByNodeTerminal returns all the allocations by node and terminal // status. func (s *StateStore) AllocsByNodeTerminal(ws memdb.WatchSet, node string, terminal bool) ([]*structs.Allocation, error) { txn := s.db.ReadTxn() // Get an iterator over the node allocations iter, err := txn.Get("allocs", "node", node, terminal) if err != nil { return nil, err } ws.Add(iter.WatchCh()) var out []*structs.Allocation for { raw := iter.Next() if raw == nil { break } out = append(out, raw.(*structs.Allocation)) } return out, nil } // AllocsByJob returns allocations by job id func (s *StateStore) AllocsByJob(ws memdb.WatchSet, namespace, jobID string, anyCreateIndex bool) ([]*structs.Allocation, error) { txn := s.db.ReadTxn() // Get the job var job *structs.Job rawJob, err := txn.First("jobs", "id", namespace, jobID) if err != nil { return nil, err } if rawJob != nil { job = rawJob.(*structs.Job) } // Get an iterator over the node allocations iter, err := txn.Get("allocs", "job", namespace, jobID) if err != nil { return nil, err } ws.Add(iter.WatchCh()) var out []*structs.Allocation for { raw := iter.Next() if raw == nil { break } alloc := raw.(*structs.Allocation) // If the allocation belongs to a job with the same ID but a different // create index and we are not getting all the allocations whose Jobs // matches the same Job ID then we skip it if !anyCreateIndex && job != nil && alloc.Job.CreateIndex != job.CreateIndex { continue } out = append(out, raw.(*structs.Allocation)) } return out, nil } // AllocsByEval returns all the allocations by eval id func (s *StateStore) AllocsByEval(ws memdb.WatchSet, evalID string) ([]*structs.Allocation, error) { txn := s.db.ReadTxn() // Get an iterator over the eval allocations iter, err := txn.Get("allocs", "eval", evalID) if err != nil { return nil, err } ws.Add(iter.WatchCh()) var out []*structs.Allocation for { raw := iter.Next() if raw == nil { break } out = append(out, raw.(*structs.Allocation)) } return out, nil } // AllocsByDeployment returns all the allocations by deployment id func (s *StateStore) AllocsByDeployment(ws memdb.WatchSet, deploymentID string) ([]*structs.Allocation, error) { txn := s.db.ReadTxn() // Get an iterator over the deployments allocations iter, err := txn.Get("allocs", "deployment", deploymentID) if err != nil { return nil, err } ws.Add(iter.WatchCh()) var out []*structs.Allocation for { raw := iter.Next() if raw == nil { break } out = append(out, raw.(*structs.Allocation)) } return out, nil } // Allocs returns an iterator over all the evaluations. 
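//
// An illustrative sketch iterating allocations in reverse create-index order
// (not part of upstream Nomad; the helper name and limit are placeholders):
//
//	func newestAllocs(s *StateStore, limit int) ([]*structs.Allocation, error) {
//		iter, err := s.Allocs(memdb.NewWatchSet(), SortReverse)
//		if err != nil {
//			return nil, err
//		}
//		out := make([]*structs.Allocation, 0, limit)
//		for raw := iter.Next(); raw != nil && len(out) < limit; raw = iter.Next() {
//			out = append(out, raw.(*structs.Allocation))
//		}
//		return out, nil
//	}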
func (s *StateStore) Allocs(ws memdb.WatchSet, sort SortOption) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() var it memdb.ResultIterator var err error switch sort { case SortReverse: it, err = txn.GetReverse("allocs", "create") default: it, err = txn.Get("allocs", "create") } if err != nil { return nil, err } ws.Add(it.WatchCh()) return it, nil } func (s *StateStore) AllocsByNamespaceOrdered(ws memdb.WatchSet, namespace string, sort SortOption) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() var ( it memdb.ResultIterator err error exact = terminate(namespace) ) switch sort { case SortReverse: it, err = txn.GetReverse("allocs", "namespace_create_prefix", exact) default: it, err = txn.Get("allocs", "namespace_create_prefix", exact) } if err != nil { return nil, err } ws.Add(it.WatchCh()) return it, nil } // AllocsByNamespace returns an iterator over all the allocations in the // namespace func (s *StateStore) AllocsByNamespace(ws memdb.WatchSet, namespace string) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() return s.allocsByNamespaceImpl(ws, txn, namespace) } // allocsByNamespaceImpl returns an iterator over all the allocations in the // namespace func (s *StateStore) allocsByNamespaceImpl(ws memdb.WatchSet, txn *txn, namespace string) (memdb.ResultIterator, error) { // Walk the entire table iter, err := txn.Get("allocs", "namespace", namespace) if err != nil { return nil, err } ws.Add(iter.WatchCh()) return iter, nil } // UpsertVaultAccessor is used to register a set of Vault Accessors. func (s *StateStore) UpsertVaultAccessor(index uint64, accessors []*structs.VaultAccessor) error { txn := s.db.WriteTxn(index) defer txn.Abort() for _, accessor := range accessors { // Set the create index accessor.CreateIndex = index // Insert the accessor if err := txn.Insert("vault_accessors", accessor); err != nil { return fmt.Errorf("accessor insert failed: %v", err) } } if err := txn.Insert("index", &IndexEntry{"vault_accessors", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return txn.Commit() } // DeleteVaultAccessors is used to delete a set of Vault Accessors func (s *StateStore) DeleteVaultAccessors(index uint64, accessors []*structs.VaultAccessor) error { txn := s.db.WriteTxn(index) defer txn.Abort() // Lookup the accessor for _, accessor := range accessors { // Delete the accessor if err := txn.Delete("vault_accessors", accessor); err != nil { return fmt.Errorf("accessor delete failed: %v", err) } } if err := txn.Insert("index", &IndexEntry{"vault_accessors", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return txn.Commit() } // VaultAccessor returns the given Vault accessor func (s *StateStore) VaultAccessor(ws memdb.WatchSet, accessor string) (*structs.VaultAccessor, error) { txn := s.db.ReadTxn() watchCh, existing, err := txn.FirstWatch("vault_accessors", "id", accessor) if err != nil { return nil, fmt.Errorf("accessor lookup failed: %v", err) } ws.Add(watchCh) if existing != nil { return existing.(*structs.VaultAccessor), nil } return nil, nil } // VaultAccessors returns an iterator of Vault accessors. 
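//
// An illustrative usage sketch (not part of upstream Nomad; the helper name
// is a placeholder):
//
//	func countVaultAccessors(s *StateStore) (int, error) {
//		iter, err := s.VaultAccessors(memdb.NewWatchSet())
//		if err != nil {
//			return 0, err
//		}
//		n := 0
//		for raw := iter.Next(); raw != nil; raw = iter.Next() {
//			n++
//		}
//		return n, nil
//	}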
func (s *StateStore) VaultAccessors(ws memdb.WatchSet) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() iter, err := txn.Get("vault_accessors", "id") if err != nil { return nil, err } ws.Add(iter.WatchCh()) return iter, nil } // VaultAccessorsByAlloc returns all the Vault accessors by alloc id func (s *StateStore) VaultAccessorsByAlloc(ws memdb.WatchSet, allocID string) ([]*structs.VaultAccessor, error) { txn := s.db.ReadTxn() // Get an iterator over the accessors iter, err := txn.Get("vault_accessors", "alloc_id", allocID) if err != nil { return nil, err } ws.Add(iter.WatchCh()) var out []*structs.VaultAccessor for { raw := iter.Next() if raw == nil { break } out = append(out, raw.(*structs.VaultAccessor)) } return out, nil } // VaultAccessorsByNode returns all the Vault accessors by node id func (s *StateStore) VaultAccessorsByNode(ws memdb.WatchSet, nodeID string) ([]*structs.VaultAccessor, error) { txn := s.db.ReadTxn() // Get an iterator over the accessors iter, err := txn.Get("vault_accessors", "node_id", nodeID) if err != nil { return nil, err } ws.Add(iter.WatchCh()) var out []*structs.VaultAccessor for { raw := iter.Next() if raw == nil { break } out = append(out, raw.(*structs.VaultAccessor)) } return out, nil } func indexEntry(table string, index uint64) *IndexEntry { return &IndexEntry{ Key: table, Value: index, } } const siTokenAccessorTable = "si_token_accessors" // UpsertSITokenAccessors is used to register a set of Service Identity token accessors. func (s *StateStore) UpsertSITokenAccessors(index uint64, accessors []*structs.SITokenAccessor) error { txn := s.db.WriteTxn(index) defer txn.Abort() for _, accessor := range accessors { // set the create index accessor.CreateIndex = index // insert the accessor if err := txn.Insert(siTokenAccessorTable, accessor); err != nil { return fmt.Errorf("accessor insert failed: %w", err) } } // update the index for this table if err := txn.Insert("index", indexEntry(siTokenAccessorTable, index)); err != nil { return fmt.Errorf("index update failed: %w", err) } return txn.Commit() } // DeleteSITokenAccessors is used to delete a set of Service Identity token accessors. func (s *StateStore) DeleteSITokenAccessors(index uint64, accessors []*structs.SITokenAccessor) error { txn := s.db.WriteTxn(index) defer txn.Abort() // Lookup each accessor for _, accessor := range accessors { // Delete the accessor if err := txn.Delete(siTokenAccessorTable, accessor); err != nil { return fmt.Errorf("accessor delete failed: %w", err) } } // update the index for this table if err := txn.Insert("index", indexEntry(siTokenAccessorTable, index)); err != nil { return fmt.Errorf("index update failed: %w", err) } return txn.Commit() } // SITokenAccessor returns the given Service Identity token accessor. func (s *StateStore) SITokenAccessor(ws memdb.WatchSet, accessorID string) (*structs.SITokenAccessor, error) { txn := s.db.ReadTxn() defer txn.Abort() watchCh, existing, err := txn.FirstWatch(siTokenAccessorTable, "id", accessorID) if err != nil { return nil, fmt.Errorf("accessor lookup failed: %w", err) } ws.Add(watchCh) if existing != nil { return existing.(*structs.SITokenAccessor), nil } return nil, nil } // SITokenAccessors returns an iterator of Service Identity token accessors. 
func (s *StateStore) SITokenAccessors(ws memdb.WatchSet) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() defer txn.Abort() iter, err := txn.Get(siTokenAccessorTable, "id") if err != nil { return nil, err } ws.Add(iter.WatchCh()) return iter, nil } // SITokenAccessorsByAlloc returns all the Service Identity token accessors by alloc ID. func (s *StateStore) SITokenAccessorsByAlloc(ws memdb.WatchSet, allocID string) ([]*structs.SITokenAccessor, error) { txn := s.db.ReadTxn() defer txn.Abort() // Get an iterator over the accessors iter, err := txn.Get(siTokenAccessorTable, "alloc_id", allocID) if err != nil { return nil, err } ws.Add(iter.WatchCh()) var result []*structs.SITokenAccessor for raw := iter.Next(); raw != nil; raw = iter.Next() { result = append(result, raw.(*structs.SITokenAccessor)) } return result, nil } // SITokenAccessorsByNode returns all the Service Identity token accessors by node ID. func (s *StateStore) SITokenAccessorsByNode(ws memdb.WatchSet, nodeID string) ([]*structs.SITokenAccessor, error) { txn := s.db.ReadTxn() defer txn.Abort() // Get an iterator over the accessors iter, err := txn.Get(siTokenAccessorTable, "node_id", nodeID) if err != nil { return nil, err } ws.Add(iter.WatchCh()) var result []*structs.SITokenAccessor for raw := iter.Next(); raw != nil; raw = iter.Next() { result = append(result, raw.(*structs.SITokenAccessor)) } return result, nil } // UpdateDeploymentStatus is used to make deployment status updates and // potentially make a evaluation func (s *StateStore) UpdateDeploymentStatus(msgType structs.MessageType, index uint64, req *structs.DeploymentStatusUpdateRequest) error { txn := s.db.WriteTxnMsgT(msgType, index) defer txn.Abort() if err := s.updateDeploymentStatusImpl(index, req.DeploymentUpdate, txn); err != nil { return err } // Upsert the job if necessary if req.Job != nil { if err := s.upsertJobImpl(index, req.Job, false, txn); err != nil { return err } } // Upsert the optional eval if req.Eval != nil { if err := s.nestedUpsertEval(txn, index, req.Eval); err != nil { return err } } return txn.Commit() } // updateDeploymentStatusImpl is used to make deployment status updates func (s *StateStore) updateDeploymentStatusImpl(index uint64, u *structs.DeploymentStatusUpdate, txn *txn) error { // Retrieve deployment ws := memdb.NewWatchSet() deployment, err := s.deploymentByIDImpl(ws, u.DeploymentID, txn) if err != nil { return err } else if deployment == nil { return fmt.Errorf("Deployment ID %q couldn't be updated as it does not exist", u.DeploymentID) } else if !deployment.Active() { return fmt.Errorf("Deployment %q has terminal status %q:", deployment.ID, deployment.Status) } // Apply the new status copy := deployment.Copy() copy.Status = u.Status copy.StatusDescription = u.StatusDescription copy.ModifyIndex = index // Insert the deployment if err := txn.Insert("deployment", copy); err != nil { return err } // Update the index if err := txn.Insert("index", &IndexEntry{"deployment", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } // If the deployment is being marked as complete, set the job to stable. if copy.Status == structs.DeploymentStatusSuccessful { if err := s.updateJobStabilityImpl(index, copy.Namespace, copy.JobID, copy.JobVersion, true, txn); err != nil { return fmt.Errorf("failed to update job stability: %v", err) } } return nil } // UpdateJobStability updates the stability of the given job and version to the // desired status. 
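//
// Illustrative call (the job name, version, and index are hypothetical, not
// from the original source): mark version 7 of job "web" in the default
// namespace as stable at Raft index idx:
//
//	if err := store.UpdateJobStability(idx, structs.DefaultNamespace, "web", 7, true); err != nil {
//		return err
//	}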
func (s *StateStore) UpdateJobStability(index uint64, namespace, jobID string, jobVersion uint64, stable bool) error { txn := s.db.WriteTxn(index) defer txn.Abort() if err := s.updateJobStabilityImpl(index, namespace, jobID, jobVersion, stable, txn); err != nil { return err } return txn.Commit() } // updateJobStabilityImpl updates the stability of the given job and version func (s *StateStore) updateJobStabilityImpl(index uint64, namespace, jobID string, jobVersion uint64, stable bool, txn *txn) error { // Get the job that is referenced job, err := s.jobByIDAndVersionImpl(nil, namespace, jobID, jobVersion, txn) if err != nil { return err } // Has already been cleared, nothing to do if job == nil { return nil } // If the job already has the desired stability, nothing to do if job.Stable == stable { return nil } copy := job.Copy() copy.Stable = stable return s.upsertJobImpl(index, copy, true, txn) } // UpdateDeploymentPromotion is used to promote canaries in a deployment and // potentially make a evaluation func (s *StateStore) UpdateDeploymentPromotion(msgType structs.MessageType, index uint64, req *structs.ApplyDeploymentPromoteRequest) error { txn := s.db.WriteTxnMsgT(msgType, index) defer txn.Abort() // Retrieve deployment and ensure it is not terminal and is active ws := memdb.NewWatchSet() deployment, err := s.deploymentByIDImpl(ws, req.DeploymentID, txn) if err != nil { return err } else if deployment == nil { return fmt.Errorf("Deployment ID %q couldn't be updated as it does not exist", req.DeploymentID) } else if !deployment.Active() { return fmt.Errorf("Deployment %q has terminal status %q:", deployment.ID, deployment.Status) } // Retrieve effected allocations iter, err := txn.Get("allocs", "deployment", req.DeploymentID) if err != nil { return err } // groupIndex is a map of groups being promoted groupIndex := make(map[string]struct{}, len(req.Groups)) for _, g := range req.Groups { groupIndex[g] = struct{}{} } // canaryIndex is the set of placed canaries in the deployment canaryIndex := make(map[string]struct{}, len(deployment.TaskGroups)) for _, dstate := range deployment.TaskGroups { for _, c := range dstate.PlacedCanaries { canaryIndex[c] = struct{}{} } } // healthyCounts is a mapping of group to the number of healthy canaries healthyCounts := make(map[string]int, len(deployment.TaskGroups)) // promotable is the set of allocations that we can move from canary to // non-canary var promotable []*structs.Allocation for { raw := iter.Next() if raw == nil { break } alloc := raw.(*structs.Allocation) // Check that the alloc is a canary if _, ok := canaryIndex[alloc.ID]; !ok { continue } // Check that the canary is part of a group being promoted if _, ok := groupIndex[alloc.TaskGroup]; !req.All && !ok { continue } // Ensure the canaries are healthy if alloc.TerminalStatus() || !alloc.DeploymentStatus.IsHealthy() { continue } healthyCounts[alloc.TaskGroup]++ promotable = append(promotable, alloc) } // Determine if we have enough healthy allocations var unhealthyErr multierror.Error for tg, dstate := range deployment.TaskGroups { if _, ok := groupIndex[tg]; !req.All && !ok { continue } need := dstate.DesiredCanaries if need == 0 { continue } if have := healthyCounts[tg]; have < need { multierror.Append(&unhealthyErr, fmt.Errorf("Task group %q has %d/%d healthy allocations", tg, have, need)) } } if err := unhealthyErr.ErrorOrNil(); err != nil { return err } // Update deployment copy := deployment.Copy() copy.ModifyIndex = index for tg, status := range copy.TaskGroups { _, ok := 
groupIndex[tg] if !req.All && !ok { continue } // reset the progress deadline if status.ProgressDeadline > 0 && !status.RequireProgressBy.IsZero() { status.RequireProgressBy = time.Now().Add(status.ProgressDeadline) } status.Promoted = true } // If the deployment no longer needs promotion, update its status if !copy.RequiresPromotion() && copy.Status == structs.DeploymentStatusRunning { copy.StatusDescription = structs.DeploymentStatusDescriptionRunning } // Insert the deployment if err := s.upsertDeploymentImpl(index, copy, txn); err != nil { return err } // Upsert the optional eval if req.Eval != nil { if err := s.nestedUpsertEval(txn, index, req.Eval); err != nil { return err } } // For each promotable allocation remove the canary field for _, alloc := range promotable { promoted := alloc.Copy() promoted.DeploymentStatus.Canary = false promoted.DeploymentStatus.ModifyIndex = index promoted.ModifyIndex = index promoted.AllocModifyIndex = index if err := txn.Insert("allocs", promoted); err != nil { return fmt.Errorf("alloc insert failed: %v", err) } } // Update the alloc index if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return txn.Commit() } // UpdateDeploymentAllocHealth is used to update the health of allocations as // part of the deployment and potentially make a evaluation func (s *StateStore) UpdateDeploymentAllocHealth(msgType structs.MessageType, index uint64, req *structs.ApplyDeploymentAllocHealthRequest) error { txn := s.db.WriteTxnMsgT(msgType, index) defer txn.Abort() // Retrieve deployment and ensure it is not terminal and is active ws := memdb.NewWatchSet() deployment, err := s.deploymentByIDImpl(ws, req.DeploymentID, txn) if err != nil { return err } else if deployment == nil { return fmt.Errorf("Deployment ID %q couldn't be updated as it does not exist", req.DeploymentID) } else if !deployment.Active() { return fmt.Errorf("Deployment %q has terminal status %q:", deployment.ID, deployment.Status) } // Update the health status of each allocation if total := len(req.HealthyAllocationIDs) + len(req.UnhealthyAllocationIDs); total != 0 { setAllocHealth := func(id string, healthy bool, ts time.Time) error { existing, err := txn.First("allocs", "id", id) if err != nil { return fmt.Errorf("alloc %q lookup failed: %v", id, err) } if existing == nil { return fmt.Errorf("unknown alloc %q", id) } old := existing.(*structs.Allocation) if old.DeploymentID != req.DeploymentID { return fmt.Errorf("alloc %q is not part of deployment %q", id, req.DeploymentID) } // Set the health copy := old.Copy() if copy.DeploymentStatus == nil { copy.DeploymentStatus = &structs.AllocDeploymentStatus{} } copy.DeploymentStatus.Healthy = pointer.Of(healthy) copy.DeploymentStatus.Timestamp = ts copy.DeploymentStatus.ModifyIndex = index copy.ModifyIndex = index if err := s.updateDeploymentWithAlloc(index, copy, old, txn); err != nil { return fmt.Errorf("error updating deployment: %v", err) } if err := txn.Insert("allocs", copy); err != nil { return fmt.Errorf("alloc insert failed: %v", err) } return nil } for _, id := range req.HealthyAllocationIDs { if err := setAllocHealth(id, true, req.Timestamp); err != nil { return err } } for _, id := range req.UnhealthyAllocationIDs { if err := setAllocHealth(id, false, req.Timestamp); err != nil { return err } } // Update the indexes if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } } // Update the deployment 
status as needed. if req.DeploymentUpdate != nil { if err := s.updateDeploymentStatusImpl(index, req.DeploymentUpdate, txn); err != nil { return err } } // Upsert the job if necessary if req.Job != nil { if err := s.upsertJobImpl(index, req.Job, false, txn); err != nil { return err } } // Upsert the optional eval if req.Eval != nil { if err := s.nestedUpsertEval(txn, index, req.Eval); err != nil { return err } } return txn.Commit() } // LatestIndex returns the greatest index value for all indexes. func (s *StateStore) LatestIndex() (uint64, error) { indexes, err := s.Indexes() if err != nil { return 0, err } var max uint64 = 0 for { raw := indexes.Next() if raw == nil { break } // Prepare the request struct idx := raw.(*IndexEntry) // Determine the max if idx.Value > max { max = idx.Value } } return max, nil } // Index finds the matching index value func (s *StateStore) Index(name string) (uint64, error) { txn := s.db.ReadTxn() // Lookup the first matching index out, err := txn.First("index", "id", name) if err != nil { return 0, err } if out == nil { return 0, nil } return out.(*IndexEntry).Value, nil } // Indexes returns an iterator over all the indexes func (s *StateStore) Indexes() (memdb.ResultIterator, error) { txn := s.db.ReadTxn() // Walk the entire nodes table iter, err := txn.Get("index", "id") if err != nil { return nil, err } return iter, nil } // ReconcileJobSummaries re-creates summaries for all jobs present in the state // store func (s *StateStore) ReconcileJobSummaries(index uint64) error { txn := s.db.WriteTxn(index) defer txn.Abort() // Get all the jobs iter, err := txn.Get("jobs", "id") if err != nil { return err } // COMPAT: Remove after 0.11 // Iterate over jobs to build a list of parent jobs and their children parentMap := make(map[string][]*structs.Job) for { rawJob := iter.Next() if rawJob == nil { break } job := rawJob.(*structs.Job) if job.ParentID != "" { children := parentMap[job.ParentID] children = append(children, job) parentMap[job.ParentID] = children } } // Get all the jobs again iter, err = txn.Get("jobs", "id") if err != nil { return err } for { rawJob := iter.Next() if rawJob == nil { break } job := rawJob.(*structs.Job) if job.IsParameterized() || job.IsPeriodic() { // COMPAT: Remove after 0.11 // The following block of code fixes incorrect child summaries due to a bug // See https://github.com/hashicorp/nomad/issues/3886 for details rawSummary, err := txn.First("job_summary", "id", job.Namespace, job.ID) if err != nil { return err } if rawSummary == nil { continue } oldSummary := rawSummary.(*structs.JobSummary) // Create an empty summary summary := &structs.JobSummary{ JobID: job.ID, Namespace: job.Namespace, Summary: make(map[string]structs.TaskGroupSummary), Children: &structs.JobChildrenSummary{}, } // Iterate over children of this job if any to fix summary counts children := parentMap[job.ID] for _, childJob := range children { switch childJob.Status { case structs.JobStatusPending: summary.Children.Pending++ case structs.JobStatusDead: summary.Children.Dead++ case structs.JobStatusRunning: summary.Children.Running++ } } // Insert the job summary if its different if !reflect.DeepEqual(summary, oldSummary) { // Set the create index of the summary same as the job's create index // and the modify index to the current index summary.CreateIndex = job.CreateIndex summary.ModifyIndex = index if err := txn.Insert("job_summary", summary); err != nil { return fmt.Errorf("error inserting job summary: %v", err) } } // Done with handling a parent job, 
continue to next continue } // Create a job summary for the job summary := &structs.JobSummary{ JobID: job.ID, Namespace: job.Namespace, Summary: make(map[string]structs.TaskGroupSummary), } for _, tg := range job.TaskGroups { summary.Summary[tg.Name] = structs.TaskGroupSummary{} } // Find all the allocations for the jobs iterAllocs, err := txn.Get("allocs", "job", job.Namespace, job.ID) if err != nil { return err } // Calculate the summary for the job for { rawAlloc := iterAllocs.Next() if rawAlloc == nil { break } alloc := rawAlloc.(*structs.Allocation) // Ignore the allocation if it doesn't belong to the currently // registered job. The allocation is checked because of issue #2304 if alloc.Job == nil || alloc.Job.CreateIndex != job.CreateIndex { continue } tg := summary.Summary[alloc.TaskGroup] switch alloc.ClientStatus { case structs.AllocClientStatusFailed: tg.Failed += 1 case structs.AllocClientStatusLost: tg.Lost += 1 case structs.AllocClientStatusUnknown: tg.Unknown += 1 case structs.AllocClientStatusComplete: tg.Complete += 1 case structs.AllocClientStatusRunning: tg.Running += 1 case structs.AllocClientStatusPending: tg.Starting += 1 default: s.logger.Error("invalid client status set on allocation", "client_status", alloc.ClientStatus, "alloc_id", alloc.ID) } summary.Summary[alloc.TaskGroup] = tg } // Set the create index of the summary same as the job's create index // and the modify index to the current index summary.CreateIndex = job.CreateIndex summary.ModifyIndex = index // Insert the job summary if err := txn.Insert("job_summary", summary); err != nil { return fmt.Errorf("error inserting job summary: %v", err) } } // Update the indexes table for job summary if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return txn.Commit() } // setJobStatuses is a helper for calling setJobStatus on multiple jobs by ID. // It takes a map of job IDs to an optional forceStatus string. It returns an // error if the job doesn't exist or setJobStatus fails. func (s *StateStore) setJobStatuses(index uint64, txn *txn, jobs map[structs.NamespacedID]string, evalDelete bool) error { for tuple, forceStatus := range jobs { existing, err := txn.First("jobs", "id", tuple.Namespace, tuple.ID) if err != nil { return fmt.Errorf("job lookup failed: %v", err) } if existing == nil { continue } if err := s.setJobStatus(index, txn, existing.(*structs.Job), evalDelete, forceStatus); err != nil { return err } } return nil } // setJobStatus sets the status of the job by looking up associated evaluations // and allocations. evalDelete should be set to true if setJobStatus is being // called because an evaluation is being deleted (potentially because of garbage // collection). If forceStatus is non-empty, the job's status will be set to the // passed status. func (s *StateStore) setJobStatus(index uint64, txn *txn, job *structs.Job, evalDelete bool, forceStatus string) error { // Capture the current status so we can check if there is a change oldStatus := job.Status newStatus := forceStatus // If forceStatus is not set, compute the jobs status. if forceStatus == "" { var err error newStatus, err = s.getJobStatus(txn, job, evalDelete) if err != nil { return err } } // Fast-path if the job has not changed. 
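// Returning early here avoids rewriting the job and bumping the jobs table
// index, which would otherwise wake blocking queries for a no-op change.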
if oldStatus == newStatus { return nil } // Copy and update the existing job updated := job.Copy() updated.Status = newStatus updated.ModifyIndex = index // Insert the job if err := txn.Insert("jobs", updated); err != nil { return fmt.Errorf("job insert failed: %v", err) } if err := txn.Insert("index", &IndexEntry{"jobs", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } // Update the children summary if err := s.setJobSummary(txn, updated, index, oldStatus, newStatus); err != nil { return fmt.Errorf("job summary update failed %w", err) } return nil } func (s *StateStore) setJobSummary(txn *txn, updated *structs.Job, index uint64, oldStatus, newStatus string) error { if updated.ParentID == "" { return nil } // Try to update the summary of the parent job summary summaryRaw, err := txn.First("job_summary", "id", updated.Namespace, updated.ParentID) if err != nil { return fmt.Errorf("unable to retrieve summary for parent job: %v", err) } // Only continue if the summary exists. It could not exist if the parent // job was removed if summaryRaw != nil { existing := summaryRaw.(*structs.JobSummary) pSummary := existing.Copy() if pSummary.Children == nil { pSummary.Children = new(structs.JobChildrenSummary) } // Determine the transition and update the correct fields children := pSummary.Children // Decrement old status if oldStatus != "" { switch oldStatus { case structs.JobStatusPending: children.Pending-- case structs.JobStatusRunning: children.Running-- case structs.JobStatusDead: children.Dead-- default: return fmt.Errorf("unknown old job status %q", oldStatus) } } // Increment new status switch newStatus { case structs.JobStatusPending: children.Pending++ case structs.JobStatusRunning: children.Running++ case structs.JobStatusDead: children.Dead++ default: return fmt.Errorf("unknown new job status %q", newStatus) } // Update the index pSummary.ModifyIndex = index // Insert the summary if err := txn.Insert("job_summary", pSummary); err != nil { return fmt.Errorf("job summary insert failed: %v", err) } if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } } return nil } func (s *StateStore) getJobStatus(txn *txn, job *structs.Job, evalDelete bool) (string, error) { // System, Periodic and Parameterized jobs are running until explicitly // stopped. if job.Type == structs.JobTypeSystem || job.IsParameterized() || job.IsPeriodic() { if job.Stop { return structs.JobStatusDead, nil } return structs.JobStatusRunning, nil } allocs, err := txn.Get("allocs", "job", job.Namespace, job.ID) if err != nil { return "", err } // If there is a non-terminal allocation, the job is running. hasAlloc := false for alloc := allocs.Next(); alloc != nil; alloc = allocs.Next() { hasAlloc = true if !alloc.(*structs.Allocation).TerminalStatus() { return structs.JobStatusRunning, nil } } evals, err := txn.Get("evals", "job_prefix", job.Namespace, job.ID) if err != nil { return "", err } hasEval := false for raw := evals.Next(); raw != nil; raw = evals.Next() { e := raw.(*structs.Evaluation) // Filter non-exact matches if e.JobID != job.ID { continue } hasEval = true if !e.TerminalStatus() { return structs.JobStatusPending, nil } } // The job is dead if all the allocations and evals are terminal or if there // are no evals because of garbage collection. 
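// Otherwise the job has no allocations and no evaluations and is not being
// garbage collected, so it was only just registered and is still pending.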
if evalDelete || hasEval || hasAlloc { return structs.JobStatusDead, nil } return structs.JobStatusPending, nil } // updateSummaryWithJob creates or updates job summaries when new jobs are // upserted or existing ones are updated func (s *StateStore) updateSummaryWithJob(index uint64, job *structs.Job, txn *txn) error { // Update the job summary summaryRaw, err := txn.First("job_summary", "id", job.Namespace, job.ID) if err != nil { return fmt.Errorf("job summary lookup failed: %v", err) } // Get the summary or create if necessary var summary *structs.JobSummary hasSummaryChanged := false if summaryRaw != nil { summary = summaryRaw.(*structs.JobSummary).Copy() } else { summary = &structs.JobSummary{ JobID: job.ID, Namespace: job.Namespace, Summary: make(map[string]structs.TaskGroupSummary), Children: new(structs.JobChildrenSummary), CreateIndex: index, } hasSummaryChanged = true } for _, tg := range job.TaskGroups { if _, ok := summary.Summary[tg.Name]; !ok { newSummary := structs.TaskGroupSummary{ Complete: 0, Failed: 0, Running: 0, Starting: 0, } summary.Summary[tg.Name] = newSummary hasSummaryChanged = true } } // The job summary has changed, so update the modify index. if hasSummaryChanged { summary.ModifyIndex = index // Update the indexes table for job summary if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } if err := txn.Insert("job_summary", summary); err != nil { return err } } return nil } // updateJobScalingPolicies upserts any scaling policies contained in the job and removes // any previous scaling policies that were removed from the job func (s *StateStore) updateJobScalingPolicies(index uint64, job *structs.Job, txn *txn) error { ws := memdb.NewWatchSet() scalingPolicies := job.GetScalingPolicies() newTargets := map[string]bool{} for _, p := range scalingPolicies { newTargets[p.JobKey()] = true } // find existing policies that need to be deleted deletedPolicies := []string{} iter, err := s.ScalingPoliciesByJobTxn(ws, job.Namespace, job.ID, txn) if err != nil { return fmt.Errorf("ScalingPoliciesByJob lookup failed: %v", err) } for raw := iter.Next(); raw != nil; raw = iter.Next() { oldPolicy := raw.(*structs.ScalingPolicy) if !newTargets[oldPolicy.JobKey()] { deletedPolicies = append(deletedPolicies, oldPolicy.ID) } } err = s.DeleteScalingPoliciesTxn(index, deletedPolicies, txn) if err != nil { return fmt.Errorf("DeleteScalingPolicies of removed policies failed: %v", err) } err = s.UpsertScalingPoliciesTxn(index, scalingPolicies, txn) if err != nil { return fmt.Errorf("UpsertScalingPolicies of policies failed: %v", err) } return nil } // updateJobCSIPlugins runs on job update, and indexes the job in the plugin func (s *StateStore) updateJobCSIPlugins(index uint64, job, prev *structs.Job, txn *txn) error { plugIns := make(map[string]*structs.CSIPlugin) upsertFn := func(job *structs.Job, delete bool) error { for _, tg := range job.TaskGroups { for _, t := range tg.Tasks { if t.CSIPluginConfig == nil { continue } plugIn, ok := plugIns[t.CSIPluginConfig.ID] if !ok { p, err := s.CSIPluginByIDTxn(txn, nil, t.CSIPluginConfig.ID) if err != nil { return err } if p == nil { plugIn = structs.NewCSIPlugin(t.CSIPluginConfig.ID, index) } else { plugIn = p.Copy() plugIn.ModifyIndex = index } plugIns[plugIn.ID] = plugIn } if delete { plugIn.DeleteJob(job, nil) } else { plugIn.AddJob(job, nil) } } } return nil } if prev != nil { err := upsertFn(prev, true) if err != nil { return err } } err := upsertFn(job, 
false) if err != nil { return err } for _, plugIn := range plugIns { err = txn.Insert("csi_plugins", plugIn) if err != nil { return fmt.Errorf("csi_plugins insert error: %v", err) } } if err := txn.Insert("index", &IndexEntry{"csi_plugins", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return nil } // updateDeploymentWithAlloc is used to update the deployment state associated // with the given allocation. The passed alloc may be updated if the deployment // status has changed to capture the modify index at which it has changed. func (s *StateStore) updateDeploymentWithAlloc(index uint64, alloc, existing *structs.Allocation, txn *txn) error { // Nothing to do if the allocation is not associated with a deployment if alloc.DeploymentID == "" { return nil } // Get the deployment ws := memdb.NewWatchSet() deployment, err := s.deploymentByIDImpl(ws, alloc.DeploymentID, txn) if err != nil { return err } if deployment == nil { return nil } // Retrieve the deployment state object _, ok := deployment.TaskGroups[alloc.TaskGroup] if !ok { // If the task group isn't part of the deployment, the task group wasn't // part of a rolling update so nothing to do return nil } // Do not modify in-place. Instead keep track of what must be done placed := 0 healthy := 0 unhealthy := 0 // If there was no existing allocation, this is a placement and we increment // the placement existingHealthSet := existing != nil && existing.DeploymentStatus.HasHealth() allocHealthSet := alloc.DeploymentStatus.HasHealth() if existing == nil || existing.DeploymentID != alloc.DeploymentID { placed++ } else if !existingHealthSet && allocHealthSet { if *alloc.DeploymentStatus.Healthy { healthy++ } else { unhealthy++ } } else if existingHealthSet && allocHealthSet { // See if it has gone from healthy to unhealthy if *existing.DeploymentStatus.Healthy && !*alloc.DeploymentStatus.Healthy { healthy-- unhealthy++ } } // Nothing to do if placed == 0 && healthy == 0 && unhealthy == 0 { return nil } // Update the allocation's deployment status modify index if alloc.DeploymentStatus != nil && healthy+unhealthy != 0 { alloc.DeploymentStatus.ModifyIndex = index } // Create a copy of the deployment object deploymentCopy := deployment.Copy() deploymentCopy.ModifyIndex = index dstate := deploymentCopy.TaskGroups[alloc.TaskGroup] dstate.PlacedAllocs += placed dstate.HealthyAllocs += healthy dstate.UnhealthyAllocs += unhealthy // Ensure PlacedCanaries accurately reflects the alloc canary status if alloc.DeploymentStatus != nil && alloc.DeploymentStatus.Canary { found := false for _, canary := range dstate.PlacedCanaries { if alloc.ID == canary { found = true break } } if !found { dstate.PlacedCanaries = append(dstate.PlacedCanaries, alloc.ID) } } // Update the progress deadline if pd := dstate.ProgressDeadline; pd != 0 { // If we are the first placed allocation for the deployment start the progress deadline. if placed != 0 && dstate.RequireProgressBy.IsZero() { // Use modify time instead of create time because we may in-place // update the allocation to be part of a new deployment. 
dstate.RequireProgressBy = time.Unix(0, alloc.ModifyTime).Add(pd) } else if healthy != 0 { if d := alloc.DeploymentStatus.Timestamp.Add(pd); d.After(dstate.RequireProgressBy) { dstate.RequireProgressBy = d } } } // Upsert the deployment if err := s.upsertDeploymentImpl(index, deploymentCopy, txn); err != nil { return err } return nil } // updateSummaryWithAlloc updates the job summary when allocations are updated // or inserted func (s *StateStore) updateSummaryWithAlloc(index uint64, alloc *structs.Allocation, existingAlloc *structs.Allocation, txn *txn) error { // We don't have to update the summary if the job is missing if alloc.Job == nil { return nil } summaryRaw, err := txn.First("job_summary", "id", alloc.Namespace, alloc.JobID) if err != nil { return fmt.Errorf("unable to lookup job summary for job id %q in namespace %q: %v", alloc.JobID, alloc.Namespace, err) } if summaryRaw == nil { // Check if the job is de-registered rawJob, err := txn.First("jobs", "id", alloc.Namespace, alloc.JobID) if err != nil { return fmt.Errorf("unable to query job: %v", err) } // If the job is de-registered then we skip updating it's summary if rawJob == nil { return nil } return fmt.Errorf("job summary for job %q in namespace %q is not present", alloc.JobID, alloc.Namespace) } // Get a copy of the existing summary jobSummary := summaryRaw.(*structs.JobSummary).Copy() // Not updating the job summary because the allocation doesn't belong to the // currently registered job if jobSummary.CreateIndex != alloc.Job.CreateIndex { return nil } tgSummary, ok := jobSummary.Summary[alloc.TaskGroup] if !ok { return fmt.Errorf("unable to find task group in the job summary: %v", alloc.TaskGroup) } summaryChanged := false if existingAlloc == nil { switch alloc.DesiredStatus { case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict: s.logger.Error("new allocation inserted into state store with bad desired status", "alloc_id", alloc.ID, "desired_status", alloc.DesiredStatus) } switch alloc.ClientStatus { case structs.AllocClientStatusPending: tgSummary.Starting += 1 if tgSummary.Queued > 0 { tgSummary.Queued -= 1 } summaryChanged = true case structs.AllocClientStatusRunning, structs.AllocClientStatusFailed, structs.AllocClientStatusComplete: s.logger.Error("new allocation inserted into state store with bad client status", "alloc_id", alloc.ID, "client_status", alloc.ClientStatus) } } else if existingAlloc.ClientStatus != alloc.ClientStatus { // Incrementing the client of the bin of the current state switch alloc.ClientStatus { case structs.AllocClientStatusRunning: tgSummary.Running += 1 case structs.AllocClientStatusFailed: tgSummary.Failed += 1 case structs.AllocClientStatusPending: tgSummary.Starting += 1 case structs.AllocClientStatusComplete: tgSummary.Complete += 1 case structs.AllocClientStatusLost: tgSummary.Lost += 1 case structs.AllocClientStatusUnknown: tgSummary.Unknown += 1 } // Decrementing the count of the bin of the last state switch existingAlloc.ClientStatus { case structs.AllocClientStatusRunning: if tgSummary.Running > 0 { tgSummary.Running -= 1 } case structs.AllocClientStatusPending: if tgSummary.Starting > 0 { tgSummary.Starting -= 1 } case structs.AllocClientStatusLost: if tgSummary.Lost > 0 { tgSummary.Lost -= 1 } case structs.AllocClientStatusUnknown: if tgSummary.Unknown > 0 { tgSummary.Unknown -= 1 } case structs.AllocClientStatusFailed, structs.AllocClientStatusComplete: default: s.logger.Error("invalid old client status for allocation", "alloc_id", existingAlloc.ID, 
"client_status", existingAlloc.ClientStatus) } summaryChanged = true } jobSummary.Summary[alloc.TaskGroup] = tgSummary if summaryChanged { jobSummary.ModifyIndex = index s.updatePluginWithJobSummary(index, jobSummary, alloc, txn) // Update the indexes table for job summary if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } if err := txn.Insert("job_summary", jobSummary); err != nil { return fmt.Errorf("updating job summary failed: %v", err) } } return nil } // updatePluginForTerminalAlloc updates the CSI plugins for an alloc when the // allocation is updated or inserted with a terminal server status. func (s *StateStore) updatePluginForTerminalAlloc(index uint64, alloc *structs.Allocation, txn *txn) error { if !alloc.ServerTerminalStatus() { return nil } tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup) for _, t := range tg.Tasks { if t.CSIPluginConfig != nil { pluginID := t.CSIPluginConfig.ID plug, err := s.CSIPluginByIDTxn(txn, nil, pluginID) if err != nil { return err } if plug == nil { // plugin may not have been created because it never // became healthy, just move on return nil } plug = plug.Copy() err = plug.DeleteAlloc(alloc.ID, alloc.NodeID) if err != nil { return err } err = updateOrGCPlugin(index, txn, plug) if err != nil { return err } } } return nil } // updatePluginWithJobSummary updates the CSI plugins for a job when the // job summary is updated by an alloc func (s *StateStore) updatePluginWithJobSummary(index uint64, summary *structs.JobSummary, alloc *structs.Allocation, txn *txn) error { tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup) if tg == nil { return nil } for _, t := range tg.Tasks { if t.CSIPluginConfig != nil { pluginID := t.CSIPluginConfig.ID plug, err := s.CSIPluginByIDTxn(txn, nil, pluginID) if err != nil { return err } if plug == nil { plug = structs.NewCSIPlugin(pluginID, index) } else { plug = plug.Copy() } plug.UpdateExpectedWithJob(alloc.Job, summary, alloc.Job.Status == structs.JobStatusDead) err = updateOrGCPlugin(index, txn, plug) if err != nil { return err } } } return nil } // UpsertACLPolicies is used to create or update a set of ACL policies func (s *StateStore) UpsertACLPolicies(msgType structs.MessageType, index uint64, policies []*structs.ACLPolicy) error { txn := s.db.WriteTxnMsgT(msgType, index) defer txn.Abort() for _, policy := range policies { // Ensure the policy hash is non-nil. This should be done outside the state store // for performance reasons, but we check here for defense in depth. 
		if len(policy.Hash) == 0 {
			policy.SetHash()
		}

		// Check if the policy already exists
		existing, err := txn.First("acl_policy", "id", policy.Name)
		if err != nil {
			return fmt.Errorf("policy lookup failed: %v", err)
		}

		// Update all the indexes
		if existing != nil {
			policy.CreateIndex = existing.(*structs.ACLPolicy).CreateIndex
			policy.ModifyIndex = index
		} else {
			policy.CreateIndex = index
			policy.ModifyIndex = index
		}

		// Update the policy
		if err := txn.Insert("acl_policy", policy); err != nil {
			return fmt.Errorf("upserting policy failed: %v", err)
		}
	}

	// Update the indexes table
	if err := txn.Insert("index", &IndexEntry{"acl_policy", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}
	return txn.Commit()
}

// DeleteACLPolicies deletes the policies with the given names
func (s *StateStore) DeleteACLPolicies(msgType structs.MessageType, index uint64, names []string) error {
	txn := s.db.WriteTxnMsgT(msgType, index)
	defer txn.Abort()

	// Delete each policy
	for _, name := range names {
		if _, err := txn.DeleteAll("acl_policy", "id", name); err != nil {
			return fmt.Errorf("deleting acl policy failed: %v", err)
		}
	}
	if err := txn.Insert("index", &IndexEntry{"acl_policy", index}); err != nil {
		return fmt.Errorf("index update failed: %v", err)
	}
	return txn.Commit()
}

// ACLPolicyByName is used to lookup a policy by name
func (s *StateStore) ACLPolicyByName(ws memdb.WatchSet, name string) (*structs.ACLPolicy, error) {
	txn := s.db.ReadTxn()

	watchCh, existing, err := txn.FirstWatch("acl_policy", "id", name)
	if err != nil {
		return nil, fmt.Errorf("acl policy lookup failed: %v", err)
	}
	ws.Add(watchCh)

	if existing != nil {
		return existing.(*structs.ACLPolicy), nil
	}
	return nil, nil
}

// ACLPolicyByNamePrefix is used to lookup policies by prefix
func (s *StateStore) ACLPolicyByNamePrefix(ws memdb.WatchSet, prefix string) (memdb.ResultIterator, error) {
	txn := s.db.ReadTxn()

	iter, err := txn.Get("acl_policy", "id_prefix", prefix)
	if err != nil {
		return nil, fmt.Errorf("acl policy lookup failed: %v", err)
	}
	ws.Add(iter.WatchCh())

	return iter, nil
}

// ACLPolicyByJob is used to lookup policies that have been attached to a
// specific job
func (s *StateStore) ACLPolicyByJob(ws memdb.WatchSet, ns, jobID string) (memdb.ResultIterator, error) {
	txn := s.db.ReadTxn()

	iter, err := txn.Get("acl_policy", "job_prefix", ns, jobID)
	if err != nil {
		return nil, fmt.Errorf("acl policy lookup failed: %v", err)
	}
	ws.Add(iter.WatchCh())

	return iter, nil
}

// ACLPolicies returns an iterator over all the acl policies
func (s *StateStore) ACLPolicies(ws memdb.WatchSet) (memdb.ResultIterator, error) {
	txn := s.db.ReadTxn()

	// Walk the entire table
	iter, err := txn.Get("acl_policy", "id")
	if err != nil {
		return nil, err
	}
	ws.Add(iter.WatchCh())
	return iter, nil
}

// UpsertACLTokens is used to create or update a set of ACL tokens
func (s *StateStore) UpsertACLTokens(msgType structs.MessageType, index uint64, tokens []*structs.ACLToken) error {
	txn := s.db.WriteTxnMsgT(msgType, index)
	defer txn.Abort()

	for _, token := range tokens {
		// Ensure the token hash is non-nil. This should be done outside the state store
		// for performance reasons, but we check here for defense in depth.
if len(token.Hash) == 0 { token.SetHash() } // Check if the token already exists existing, err := txn.First("acl_token", "id", token.AccessorID) if err != nil { return fmt.Errorf("token lookup failed: %v", err) } // Update all the indexes if existing != nil { existTK := existing.(*structs.ACLToken) token.CreateIndex = existTK.CreateIndex token.ModifyIndex = index // Do not allow SecretID or create time to change token.SecretID = existTK.SecretID token.CreateTime = existTK.CreateTime } else { token.CreateIndex = index token.ModifyIndex = index } // Update the token if err := txn.Insert("acl_token", token); err != nil { return fmt.Errorf("upserting token failed: %v", err) } } // Update the indexes table if err := txn.Insert("index", &IndexEntry{"acl_token", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return txn.Commit() } // DeleteACLTokens deletes the tokens with the given accessor ids func (s *StateStore) DeleteACLTokens(msgType structs.MessageType, index uint64, ids []string) error { txn := s.db.WriteTxnMsgT(msgType, index) defer txn.Abort() // Delete the tokens for _, id := range ids { if _, err := txn.DeleteAll("acl_token", "id", id); err != nil { return fmt.Errorf("deleting acl token failed: %v", err) } } if err := txn.Insert("index", &IndexEntry{"acl_token", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return txn.Commit() } // ACLTokenByAccessorID is used to lookup a token by accessor ID func (s *StateStore) ACLTokenByAccessorID(ws memdb.WatchSet, id string) (*structs.ACLToken, error) { if id == "" { return nil, fmt.Errorf("acl token lookup failed: missing accessor id") } txn := s.db.ReadTxn() watchCh, existing, err := txn.FirstWatch("acl_token", "id", id) if err != nil { return nil, fmt.Errorf("acl token lookup failed: %v", err) } ws.Add(watchCh) // If the existing token is nil, this indicates it does not exist in state. if existing == nil { return nil, nil } // Assert the token type which allows us to perform additional work on the // token that is needed before returning the call. token := existing.(*structs.ACLToken) // Handle potential staleness of ACL role links. if token, err = s.fixTokenRoleLinks(txn, token); err != nil { return nil, err } return token, nil } // ACLTokenBySecretID is used to lookup a token by secret ID func (s *StateStore) ACLTokenBySecretID(ws memdb.WatchSet, secretID string) (*structs.ACLToken, error) { if secretID == "" { return nil, fmt.Errorf("acl token lookup failed: missing secret id") } txn := s.db.ReadTxn() watchCh, existing, err := txn.FirstWatch("acl_token", "secret", secretID) if err != nil { return nil, fmt.Errorf("acl token lookup failed: %v", err) } ws.Add(watchCh) // If the existing token is nil, this indicates it does not exist in state. if existing == nil { return nil, nil } // Assert the token type which allows us to perform additional work on the // token that is needed before returning the call. token := existing.(*structs.ACLToken) // Handle potential staleness of ACL role links. 
if token, err = s.fixTokenRoleLinks(txn, token); err != nil { return nil, err } return token, nil } // ACLTokenByAccessorIDPrefix is used to lookup tokens by prefix func (s *StateStore) ACLTokenByAccessorIDPrefix(ws memdb.WatchSet, prefix string, sort SortOption) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() var iter memdb.ResultIterator var err error switch sort { case SortReverse: iter, err = txn.GetReverse("acl_token", "id_prefix", prefix) default: iter, err = txn.Get("acl_token", "id_prefix", prefix) } if err != nil { return nil, fmt.Errorf("acl token lookup failed: %v", err) } ws.Add(iter.WatchCh()) return iter, nil } // ACLTokens returns an iterator over all the tokens func (s *StateStore) ACLTokens(ws memdb.WatchSet, sort SortOption) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() var iter memdb.ResultIterator var err error switch sort { case SortReverse: iter, err = txn.GetReverse("acl_token", "create") default: iter, err = txn.Get("acl_token", "create") } if err != nil { return nil, err } ws.Add(iter.WatchCh()) return iter, nil } // ACLTokensByGlobal returns an iterator over all the tokens filtered by global value func (s *StateStore) ACLTokensByGlobal(ws memdb.WatchSet, globalVal bool, sort SortOption) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() var iter memdb.ResultIterator var err error // Walk the entire table switch sort { case SortReverse: iter, err = txn.GetReverse("acl_token", "global", globalVal) default: iter, err = txn.Get("acl_token", "global", globalVal) } if err != nil { return nil, err } ws.Add(iter.WatchCh()) return iter, nil } // CanBootstrapACLToken checks if bootstrapping is possible and returns the reset index func (s *StateStore) CanBootstrapACLToken() (bool, uint64, error) { txn := s.db.ReadTxn() // Lookup the bootstrap sentinel out, err := txn.First("index", "id", "acl_token_bootstrap") if err != nil { return false, 0, err } // No entry, we haven't bootstrapped yet if out == nil { return true, 0, nil } // Return the reset index if we've already bootstrapped return false, out.(*IndexEntry).Value, nil } // BootstrapACLTokens is used to create an initial ACL token. func (s *StateStore) BootstrapACLTokens(msgType structs.MessageType, index uint64, resetIndex uint64, token *structs.ACLToken) error { txn := s.db.WriteTxnMsgT(msgType, index) defer txn.Abort() // Check if we have already done a bootstrap existing, err := txn.First("index", "id", "acl_token_bootstrap") if err != nil { return fmt.Errorf("bootstrap check failed: %v", err) } if existing != nil { if resetIndex == 0 { return fmt.Errorf("ACL bootstrap already done") } else if resetIndex != existing.(*IndexEntry).Value { return fmt.Errorf("Invalid reset index for ACL bootstrap") } } // Update the Create/Modify time token.CreateIndex = index token.ModifyIndex = index // Insert the token if err := txn.Insert("acl_token", token); err != nil { return fmt.Errorf("upserting token failed: %v", err) } // Update the indexes table, prevents future bootstrap until reset if err := txn.Insert("index", &IndexEntry{"acl_token", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } if err := txn.Insert("index", &IndexEntry{"acl_token_bootstrap", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return txn.Commit() } // UpsertOneTimeToken is used to create or update a set of ACL // tokens. Validating that we're not upserting an already-expired token is // made the responsibility of the caller to facilitate testing. 
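//
// Illustrative usage (not from the original source); the secret ID and expiry
// are hypothetical values supplied by the caller, and msgType is whichever
// structs.MessageType the FSM uses for this request:
//
//	ott := &structs.OneTimeToken{
//		OneTimeSecretID: secretID,
//		ExpiresAt:       time.Now().Add(10 * time.Minute),
//	}
//	if err := store.UpsertOneTimeToken(msgType, index, ott); err != nil {
//		return err
//	}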
func (s *StateStore) UpsertOneTimeToken(msgType structs.MessageType, index uint64, token *structs.OneTimeToken) error { txn := s.db.WriteTxnMsgT(msgType, index) defer txn.Abort() // we expect the RPC call to set the ExpiresAt if token.ExpiresAt.IsZero() { return fmt.Errorf("one-time token must have an ExpiresAt time") } // Update all the indexes token.CreateIndex = index token.ModifyIndex = index // Create the token if err := txn.Insert("one_time_token", token); err != nil { return fmt.Errorf("upserting one-time token failed: %v", err) } // Update the indexes table if err := txn.Insert("index", &IndexEntry{"one_time_token", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return txn.Commit() } // DeleteOneTimeTokens deletes the tokens with the given ACLToken Accessor IDs func (s *StateStore) DeleteOneTimeTokens(msgType structs.MessageType, index uint64, ids []string) error { txn := s.db.WriteTxnMsgT(msgType, index) defer txn.Abort() var deleted int for _, id := range ids { d, err := txn.DeleteAll("one_time_token", "id", id) if err != nil { return fmt.Errorf("deleting one-time token failed: %v", err) } deleted += d } if deleted > 0 { if err := txn.Insert("index", &IndexEntry{"one_time_token", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } } return txn.Commit() } // ExpireOneTimeTokens deletes tokens that have expired func (s *StateStore) ExpireOneTimeTokens(msgType structs.MessageType, index uint64, timestamp time.Time) error { txn := s.db.WriteTxnMsgT(msgType, index) defer txn.Abort() iter, err := s.oneTimeTokensExpiredTxn(txn, nil, timestamp) if err != nil { return err } var deleted int for { raw := iter.Next() if raw == nil { break } ott, ok := raw.(*structs.OneTimeToken) if !ok || ott == nil { return fmt.Errorf("could not decode one-time token") } d, err := txn.DeleteAll("one_time_token", "secret", ott.OneTimeSecretID) if err != nil { return fmt.Errorf("deleting one-time token failed: %v", err) } deleted += d } if deleted > 0 { if err := txn.Insert("index", &IndexEntry{"one_time_token", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } } return txn.Commit() } // oneTimeTokensExpiredTxn returns an iterator over all expired one-time tokens func (s *StateStore) oneTimeTokensExpiredTxn(txn *txn, ws memdb.WatchSet, timestamp time.Time) (memdb.ResultIterator, error) { iter, err := txn.Get("one_time_token", "id") if err != nil { return nil, fmt.Errorf("one-time token lookup failed: %v", err) } ws.Add(iter.WatchCh()) iter = memdb.NewFilterIterator(iter, expiredOneTimeTokenFilter(timestamp)) return iter, nil } // OneTimeTokenBySecret is used to lookup a token by secret func (s *StateStore) OneTimeTokenBySecret(ws memdb.WatchSet, secret string) (*structs.OneTimeToken, error) { if secret == "" { return nil, fmt.Errorf("one-time token lookup failed: missing secret") } txn := s.db.ReadTxn() watchCh, existing, err := txn.FirstWatch("one_time_token", "secret", secret) if err != nil { return nil, fmt.Errorf("one-time token lookup failed: %v", err) } ws.Add(watchCh) if existing != nil { return existing.(*structs.OneTimeToken), nil } return nil, nil } // expiredOneTimeTokenFilter returns a filter function that returns only // expired one-time tokens func expiredOneTimeTokenFilter(now time.Time) func(interface{}) bool { return func(raw interface{}) bool { ott, ok := raw.(*structs.OneTimeToken) if !ok { return true } return ott.ExpiresAt.After(now) } } // SchedulerConfig is used to get the current Scheduler configuration. 
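//
// A check-and-set update can be built on top of this, assuming the caller
// never mutates the returned configuration in place (sketch only; `updated`
// and `raftIndex` are hypothetical):
//
//	idx, current, err := store.SchedulerConfig()
//	if err != nil {
//		return err
//	}
//	// ... derive an updated *structs.SchedulerConfiguration from current ...
//	written, err := store.SchedulerCASConfig(raftIndex, idx, updated)
//	if err != nil {
//		return err
//	}
//	if !written {
//		// lost the race: another write changed the config since idx
//	}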
func (s *StateStore) SchedulerConfig() (uint64, *structs.SchedulerConfiguration, error) { tx := s.db.ReadTxn() defer tx.Abort() return s.schedulerConfigTxn(tx) } func (s *StateStore) schedulerConfigTxn(txn *txn) (uint64, *structs.SchedulerConfiguration, error) { // Get the scheduler config c, err := txn.First("scheduler_config", "id") if err != nil { return 0, nil, fmt.Errorf("failed scheduler config lookup: %s", err) } config, ok := c.(*structs.SchedulerConfiguration) if !ok { return 0, nil, nil } return config.ModifyIndex, config, nil } // SchedulerSetConfig is used to set the current Scheduler configuration. func (s *StateStore) SchedulerSetConfig(index uint64, config *structs.SchedulerConfiguration) error { tx := s.db.WriteTxn(index) defer tx.Abort() s.schedulerSetConfigTxn(index, tx, config) return tx.Commit() } func (s *StateStore) ClusterMetadata(ws memdb.WatchSet) (*structs.ClusterMetadata, error) { txn := s.db.ReadTxn() defer txn.Abort() // Get the cluster metadata watchCh, m, err := txn.FirstWatch("cluster_meta", "id") if err != nil { return nil, fmt.Errorf("failed cluster metadata lookup: %w", err) } ws.Add(watchCh) if m != nil { return m.(*structs.ClusterMetadata), nil } return nil, nil } func (s *StateStore) ClusterSetMetadata(index uint64, meta *structs.ClusterMetadata) error { txn := s.db.WriteTxn(index) defer txn.Abort() if err := s.setClusterMetadata(txn, meta); err != nil { return fmt.Errorf("set cluster metadata failed: %w", err) } return txn.Commit() } // WithWriteTransaction executes the passed function within a write transaction, // and returns its result. If the invocation returns no error, the transaction // is committed; otherwise, it's aborted. func (s *StateStore) WithWriteTransaction(msgType structs.MessageType, index uint64, fn func(Txn) error) error { tx := s.db.WriteTxnMsgT(msgType, index) defer tx.Abort() err := fn(tx) if err == nil { return tx.Commit() } return err } // SchedulerCASConfig is used to update the scheduler configuration with a // given Raft index. If the CAS index specified is not equal to the last observed index // for the config, then the call is a noop. func (s *StateStore) SchedulerCASConfig(index, cidx uint64, config *structs.SchedulerConfiguration) (bool, error) { tx := s.db.WriteTxn(index) defer tx.Abort() // Check for an existing config existing, err := tx.First("scheduler_config", "id") if err != nil { return false, fmt.Errorf("failed scheduler config lookup: %s", err) } // If the existing index does not match the provided CAS // index arg, then we shouldn't update anything and can safely // return early here. e, ok := existing.(*structs.SchedulerConfiguration) if !ok || (e != nil && e.ModifyIndex != cidx) { return false, nil } s.schedulerSetConfigTxn(index, tx, config) if err := tx.Commit(); err != nil { return false, err } return true, nil } func (s *StateStore) schedulerSetConfigTxn(idx uint64, tx *txn, config *structs.SchedulerConfiguration) error { // Check for an existing config existing, err := tx.First("scheduler_config", "id") if err != nil { return fmt.Errorf("failed scheduler config lookup: %s", err) } // Set the indexes. 
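// An existing configuration keeps its original CreateIndex; only ModifyIndex
// advances to the index of this write.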
if existing != nil { config.CreateIndex = existing.(*structs.SchedulerConfiguration).CreateIndex } else { config.CreateIndex = idx } config.ModifyIndex = idx if err := tx.Insert("scheduler_config", config); err != nil { return fmt.Errorf("failed updating scheduler config: %s", err) } return nil } func (s *StateStore) setClusterMetadata(txn *txn, meta *structs.ClusterMetadata) error { // Check for an existing config, if it exists, verify that the cluster ID matches existing, err := txn.First("cluster_meta", "id") if err != nil { return fmt.Errorf("failed cluster meta lookup: %v", err) } if existing != nil { existingClusterID := existing.(*structs.ClusterMetadata).ClusterID if meta.ClusterID != existingClusterID && existingClusterID != "" { // there is a bug in cluster ID detection return fmt.Errorf("refusing to set new cluster id, previous: %s, new: %s", existingClusterID, meta.ClusterID) } } // update is technically a noop, unless someday we add more / mutable fields if err := txn.Insert("cluster_meta", meta); err != nil { return fmt.Errorf("set cluster metadata failed: %v", err) } return nil } // UpsertScalingPolicies is used to insert a new scaling policy. func (s *StateStore) UpsertScalingPolicies(index uint64, scalingPolicies []*structs.ScalingPolicy) error { txn := s.db.WriteTxn(index) defer txn.Abort() if err := s.UpsertScalingPoliciesTxn(index, scalingPolicies, txn); err != nil { return err } return txn.Commit() } // UpsertScalingPoliciesTxn is used to insert a new scaling policy. func (s *StateStore) UpsertScalingPoliciesTxn(index uint64, scalingPolicies []*structs.ScalingPolicy, txn *txn) error { hadUpdates := false for _, policy := range scalingPolicies { // Check if the scaling policy already exists // Policy uniqueness is based on target and type it, err := txn.Get("scaling_policy", "target", policy.Target[structs.ScalingTargetNamespace], policy.Target[structs.ScalingTargetJob], policy.Target[structs.ScalingTargetGroup], policy.Target[structs.ScalingTargetTask], ) if err != nil { return fmt.Errorf("scaling policy lookup failed: %v", err) } // Check if type matches var existing *structs.ScalingPolicy for raw := it.Next(); raw != nil; raw = it.Next() { p := raw.(*structs.ScalingPolicy) if p.Type == policy.Type { existing = p break } } // Setup the indexes correctly if existing != nil { if !existing.Diff(policy) { continue } policy.ID = existing.ID policy.CreateIndex = existing.CreateIndex } else { // policy.ID must have been set already in Job.Register before log apply policy.CreateIndex = index } policy.ModifyIndex = index // Insert the scaling policy hadUpdates = true if err := txn.Insert("scaling_policy", policy); err != nil { return err } } // Update the indexes table for scaling policy if we updated any policies if hadUpdates { if err := txn.Insert("index", &IndexEntry{"scaling_policy", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } } return nil } // NamespaceByName is used to lookup a namespace by name func (s *StateStore) NamespaceByName(ws memdb.WatchSet, name string) (*structs.Namespace, error) { txn := s.db.ReadTxn() return s.namespaceByNameImpl(ws, txn, name) } // namespaceByNameImpl is used to lookup a namespace by name func (s *StateStore) namespaceByNameImpl(ws memdb.WatchSet, txn *txn, name string) (*structs.Namespace, error) { watchCh, existing, err := txn.FirstWatch(TableNamespaces, "id", name) if err != nil { return nil, fmt.Errorf("namespace lookup failed: %v", err) } ws.Add(watchCh) if existing != nil { return 
existing.(*structs.Namespace), nil } return nil, nil } // namespaceExists returns whether a namespace exists func (s *StateStore) namespaceExists(txn *txn, namespace string) (bool, error) { if namespace == structs.DefaultNamespace { return true, nil } existing, err := txn.First(TableNamespaces, "id", namespace) if err != nil { return false, fmt.Errorf("namespace lookup failed: %v", err) } return existing != nil, nil } // NamespacesByNamePrefix is used to lookup namespaces by prefix func (s *StateStore) NamespacesByNamePrefix(ws memdb.WatchSet, namePrefix string) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() iter, err := txn.Get(TableNamespaces, "id_prefix", namePrefix) if err != nil { return nil, fmt.Errorf("namespaces lookup failed: %v", err) } ws.Add(iter.WatchCh()) return iter, nil } // Namespaces returns an iterator over all the namespaces func (s *StateStore) Namespaces(ws memdb.WatchSet) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() // Walk the entire namespace table iter, err := txn.Get(TableNamespaces, "id") if err != nil { return nil, err } ws.Add(iter.WatchCh()) return iter, nil } func (s *StateStore) NamespaceNames() ([]string, error) { it, err := s.Namespaces(nil) if err != nil { return nil, err } nses := []string{} for { next := it.Next() if next == nil { break } ns := next.(*structs.Namespace) nses = append(nses, ns.Name) } return nses, nil } // UpsertNamespaces is used to register or update a set of namespaces. func (s *StateStore) UpsertNamespaces(index uint64, namespaces []*structs.Namespace) error { txn := s.db.WriteTxn(index) defer txn.Abort() for _, ns := range namespaces { if err := s.upsertNamespaceImpl(index, txn, ns); err != nil { return err } } if err := txn.Insert("index", &IndexEntry{TableNamespaces, index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return txn.Commit() } // upsertNamespaceImpl is used to upsert a namespace func (s *StateStore) upsertNamespaceImpl(index uint64, txn *txn, namespace *structs.Namespace) error { // Ensure the namespace hash is non-nil. This should be done outside the state store // for performance reasons, but we check here for defense in depth. 
ns := namespace if len(ns.Hash) == 0 { ns.SetHash() } // Check if the namespace already exists existing, err := txn.First(TableNamespaces, "id", ns.Name) if err != nil { return fmt.Errorf("namespace lookup failed: %v", err) } // Setup the indexes correctly and determine which quotas need to be // reconciled var oldQuota string if existing != nil { exist := existing.(*structs.Namespace) ns.CreateIndex = exist.CreateIndex ns.ModifyIndex = index // Grab the old quota on the namespace oldQuota = exist.Quota } else { ns.CreateIndex = index ns.ModifyIndex = index } // Validate that the quota on the new namespace exists if ns.Quota != "" { exists, err := s.quotaSpecExists(txn, ns.Quota) if err != nil { return fmt.Errorf("looking up namespace quota %q failed: %v", ns.Quota, err) } else if !exists { return fmt.Errorf("namespace %q using non-existent quota %q", ns.Name, ns.Quota) } } // Insert the namespace if err := txn.Insert(TableNamespaces, ns); err != nil { return fmt.Errorf("namespace insert failed: %v", err) } // Reconcile changed quotas return s.quotaReconcile(index, txn, ns.Quota, oldQuota) } // DeleteNamespaces is used to remove a set of namespaces func (s *StateStore) DeleteNamespaces(index uint64, names []string) error { txn := s.db.WriteTxn(index) defer txn.Abort() for _, name := range names { // Lookup the namespace existing, err := txn.First(TableNamespaces, "id", name) if err != nil { return fmt.Errorf("namespace lookup failed: %v", err) } if existing == nil { return fmt.Errorf("namespace not found") } ns := existing.(*structs.Namespace) if ns.Name == structs.DefaultNamespace { return fmt.Errorf("default namespace can not be deleted") } // Ensure that the namespace doesn't have any non-terminal jobs iter, err := s.jobsByNamespaceImpl(nil, name, txn) if err != nil { return err } for { raw := iter.Next() if raw == nil { break } job := raw.(*structs.Job) if job.Status != structs.JobStatusDead { return fmt.Errorf("namespace %q contains at least one non-terminal job %q. "+ "All jobs must be terminal in namespace before it can be deleted", name, job.ID) } } vIter, err := s.csiVolumesByNamespaceImpl(txn, nil, name, "") if err != nil { return err } rawVol := vIter.Next() if rawVol != nil { vol := rawVol.(*structs.CSIVolume) return fmt.Errorf("namespace %q contains at least one CSI volume %q. "+ "All CSI volumes in namespace must be deleted before it can be deleted", name, vol.ID) } varIter, err := s.getVariablesByNamespaceImpl(txn, nil, name) if err != nil { return err } if varIter.Next() != nil { // unlike job/volume, don't show the path here because the user may // not have List permissions on the vars in this namespace return fmt.Errorf("namespace %q contains at least one variable. "+ "All variables in namespace must be deleted before it can be deleted", name) } // Delete the namespace if err := txn.Delete(TableNamespaces, existing); err != nil { return fmt.Errorf("namespace deletion failed: %v", err) } } if err := txn.Insert("index", &IndexEntry{TableNamespaces, index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return txn.Commit() } func (s *StateStore) DeleteScalingPolicies(index uint64, ids []string) error { txn := s.db.WriteTxn(index) defer txn.Abort() err := s.DeleteScalingPoliciesTxn(index, ids, txn) if err == nil { return txn.Commit() } return err } // DeleteScalingPoliciesTxn is used to delete a set of scaling policies by ID. 
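//
// A hypothetical caller sketch (illustrative only; namespace, jobID, index, and
// txn are assumed to already be in scope): policy IDs are typically collected
// from the same write transaction before deleting, for example via
// ScalingPoliciesByJobTxn:
//
//    iter, err := s.ScalingPoliciesByJobTxn(nil, namespace, jobID, txn)
//    if err != nil {
//        return err
//    }
//    ids := []string{}
//    for raw := iter.Next(); raw != nil; raw = iter.Next() {
//        ids = append(ids, raw.(*structs.ScalingPolicy).ID)
//    }
//    if err := s.DeleteScalingPoliciesTxn(index, ids, txn); err != nil {
//        return err
//    }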
func (s *StateStore) DeleteScalingPoliciesTxn(index uint64, ids []string, txn *txn) error { if len(ids) == 0 { return nil } for _, id := range ids { // Lookup the scaling policy existing, err := txn.First("scaling_policy", "id", id) if err != nil { return fmt.Errorf("scaling policy lookup failed: %v", err) } if existing == nil { return fmt.Errorf("scaling policy not found") } // Delete the scaling policy if err := txn.Delete("scaling_policy", existing); err != nil { return fmt.Errorf("scaling policy delete failed: %v", err) } } if err := txn.Insert("index", &IndexEntry{"scaling_policy", index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return nil } // ScalingPolicies returns an iterator over all the scaling policies func (s *StateStore) ScalingPolicies(ws memdb.WatchSet) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() // Walk the entire scaling_policy table iter, err := txn.Get("scaling_policy", "id") if err != nil { return nil, err } ws.Add(iter.WatchCh()) return iter, nil } // ScalingPoliciesByTypePrefix returns an iterator over scaling policies with a certain type prefix. func (s *StateStore) ScalingPoliciesByTypePrefix(ws memdb.WatchSet, t string) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() iter, err := txn.Get("scaling_policy", "type_prefix", t) if err != nil { return nil, err } ws.Add(iter.WatchCh()) return iter, nil } func (s *StateStore) ScalingPoliciesByNamespace(ws memdb.WatchSet, namespace, typ string) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() iter, err := txn.Get("scaling_policy", "target_prefix", namespace) if err != nil { return nil, err } ws.Add(iter.WatchCh()) // Wrap the iterator in a filter to exact match the namespace iter = memdb.NewFilterIterator(iter, scalingPolicyNamespaceFilter(namespace)) // If policy type is specified as well, wrap again if typ != "" { iter = memdb.NewFilterIterator(iter, func(raw interface{}) bool { p, ok := raw.(*structs.ScalingPolicy) if !ok { return true } return !strings.HasPrefix(p.Type, typ) }) } return iter, nil } func (s *StateStore) ScalingPoliciesByJob(ws memdb.WatchSet, namespace, jobID, policyType string) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() iter, err := s.ScalingPoliciesByJobTxn(ws, namespace, jobID, txn) if err != nil { return nil, err } if policyType == "" { return iter, nil } filter := func(raw interface{}) bool { p, ok := raw.(*structs.ScalingPolicy) if !ok { return true } return policyType != p.Type } return memdb.NewFilterIterator(iter, filter), nil } func (s *StateStore) ScalingPoliciesByJobTxn(ws memdb.WatchSet, namespace, jobID string, txn *txn) (memdb.ResultIterator, error) { iter, err := txn.Get("scaling_policy", "target_prefix", namespace, jobID) if err != nil { return nil, err } ws.Add(iter.WatchCh()) filter := func(raw interface{}) bool { d, ok := raw.(*structs.ScalingPolicy) if !ok { return true } return d.Target[structs.ScalingTargetJob] != jobID } // Wrap the iterator in a filter wrap := memdb.NewFilterIterator(iter, filter) return wrap, nil } func (s *StateStore) ScalingPolicyByID(ws memdb.WatchSet, id string) (*structs.ScalingPolicy, error) { txn := s.db.ReadTxn() watchCh, existing, err := txn.FirstWatch("scaling_policy", "id", id) if err != nil { return nil, fmt.Errorf("scaling_policy lookup failed: %v", err) } ws.Add(watchCh) if existing != nil { return existing.(*structs.ScalingPolicy), nil } return nil, nil } // ScalingPolicyByTargetAndType returns a fully-qualified policy against a target and policy type, // or nil if it does not exist. 
This method does not honor the watchset on the policy type, just the target. func (s *StateStore) ScalingPolicyByTargetAndType(ws memdb.WatchSet, target map[string]string, typ string) (*structs.ScalingPolicy, error) { txn := s.db.ReadTxn() namespace := target[structs.ScalingTargetNamespace] job := target[structs.ScalingTargetJob] group := target[structs.ScalingTargetGroup] task := target[structs.ScalingTargetTask] it, err := txn.Get("scaling_policy", "target", namespace, job, group, task) if err != nil { return nil, fmt.Errorf("scaling_policy lookup failed: %v", err) } ws.Add(it.WatchCh()) // Check for type var existing *structs.ScalingPolicy for raw := it.Next(); raw != nil; raw = it.Next() { p := raw.(*structs.ScalingPolicy) if p.Type == typ { existing = p break } } if existing != nil { return existing, nil } return nil, nil } func (s *StateStore) ScalingPoliciesByIDPrefix(ws memdb.WatchSet, namespace string, prefix string) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() iter, err := txn.Get("scaling_policy", "id_prefix", prefix) if err != nil { return nil, fmt.Errorf("scaling policy lookup failed: %v", err) } ws.Add(iter.WatchCh()) iter = memdb.NewFilterIterator(iter, scalingPolicyNamespaceFilter(namespace)) return iter, nil } // scalingPolicyNamespaceFilter returns a filter function that filters all // scaling policies not targeting the given namespace. func scalingPolicyNamespaceFilter(namespace string) func(interface{}) bool { return func(raw interface{}) bool { p, ok := raw.(*structs.ScalingPolicy) if !ok { return true } return p.Target[structs.ScalingTargetNamespace] != namespace } } // StateSnapshot is used to provide a point-in-time snapshot type StateSnapshot struct { StateStore } // DenormalizeAllocationsMap takes in a map of nodes to allocations, and queries the // Allocation for each of the Allocation diffs and merges the updated attributes with // the existing Allocation, and attaches the Job provided func (s *StateSnapshot) DenormalizeAllocationsMap(nodeAllocations map[string][]*structs.Allocation) error { for nodeID, allocs := range nodeAllocations { denormalizedAllocs, err := s.DenormalizeAllocationSlice(allocs) if err != nil { return err } nodeAllocations[nodeID] = denormalizedAllocs } return nil } // DenormalizeAllocationSlice queries the Allocation for each allocation diff // represented as an Allocation and merges the updated attributes with the existing // Allocation, and attaches the Job provided. // // This should only be called on terminal allocs, particularly stopped or preempted allocs func (s *StateSnapshot) DenormalizeAllocationSlice(allocs []*structs.Allocation) ([]*structs.Allocation, error) { allocDiffs := make([]*structs.AllocationDiff, len(allocs)) for i, alloc := range allocs { allocDiffs[i] = alloc.AllocationDiff() } return s.DenormalizeAllocationDiffSlice(allocDiffs) } // DenormalizeAllocationDiffSlice queries the Allocation for each AllocationDiff and merges // the updated attributes with the existing Allocation, and attaches the Job provided. 
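//
// A hypothetical input sketch (illustrative only; allocID, now, and snap are
// assumed to be in scope, with snap being a *StateSnapshot): each
// AllocationDiff carries only the fields that changed when the alloc was
// stopped or preempted, and this method fills in the rest from the stored
// Allocation:
//
//    diff := &structs.AllocationDiff{
//        ID:                 allocID,
//        DesiredDescription: "alloc stopped by plan",
//        ClientStatus:       structs.AllocClientStatusComplete,
//        ModifyTime:         now.UnixNano(),
//    }
//    allocs, err := snap.DenormalizeAllocationDiffSlice([]*structs.AllocationDiff{diff})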
// // This should only be called on terminal alloc, particularly stopped or preempted allocs func (s *StateSnapshot) DenormalizeAllocationDiffSlice(allocDiffs []*structs.AllocationDiff) ([]*structs.Allocation, error) { // Output index for denormalized Allocations j := 0 denormalizedAllocs := make([]*structs.Allocation, len(allocDiffs)) for _, allocDiff := range allocDiffs { alloc, err := s.AllocByID(nil, allocDiff.ID) if err != nil { return nil, fmt.Errorf("alloc lookup failed: %v", err) } if alloc == nil { return nil, fmt.Errorf("alloc %v doesn't exist", allocDiff.ID) } // Merge the updates to the Allocation. Don't update alloc.Job for terminal allocs // so alloc refers to the latest Job view before destruction and to ease handler implementations allocCopy := alloc.Copy() if allocDiff.PreemptedByAllocation != "" { allocCopy.PreemptedByAllocation = allocDiff.PreemptedByAllocation allocCopy.DesiredDescription = getPreemptedAllocDesiredDescription(allocDiff.PreemptedByAllocation) allocCopy.DesiredStatus = structs.AllocDesiredStatusEvict } else { // If alloc is a stopped alloc allocCopy.DesiredDescription = allocDiff.DesiredDescription allocCopy.DesiredStatus = structs.AllocDesiredStatusStop if allocDiff.ClientStatus != "" { allocCopy.ClientStatus = allocDiff.ClientStatus } if allocDiff.FollowupEvalID != "" { allocCopy.FollowupEvalID = allocDiff.FollowupEvalID } } if allocDiff.ModifyTime != 0 { allocCopy.ModifyTime = allocDiff.ModifyTime } // Update the allocDiff in the slice to equal the denormalized alloc denormalizedAllocs[j] = allocCopy j++ } // Retain only the denormalized Allocations in the slice denormalizedAllocs = denormalizedAllocs[:j] return denormalizedAllocs, nil } func getPreemptedAllocDesiredDescription(preemptedByAllocID string) string { return fmt.Sprintf("Preempted by alloc ID %v", preemptedByAllocID) } // UpsertRootKeyMeta saves root key meta or updates it in-place. func (s *StateStore) UpsertRootKeyMeta(index uint64, rootKeyMeta *structs.RootKeyMeta, rekey bool) error { txn := s.db.WriteTxn(index) defer txn.Abort() // get any existing key for updating raw, err := txn.First(TableRootKeyMeta, indexID, rootKeyMeta.KeyID) if err != nil { return fmt.Errorf("root key metadata lookup failed: %v", err) } isRotation := false if raw != nil { existing := raw.(*structs.RootKeyMeta) rootKeyMeta.CreateIndex = existing.CreateIndex rootKeyMeta.CreateTime = existing.CreateTime isRotation = !existing.Active() && rootKeyMeta.Active() } else { rootKeyMeta.CreateIndex = index isRotation = rootKeyMeta.Active() } rootKeyMeta.ModifyIndex = index if rekey && !isRotation { return fmt.Errorf("cannot rekey without setting the new key active") } // if the upsert is for a newly-active key, we need to set all the // other keys as inactive in the same transaction. 
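// The loop below applies the following transitions to every other key, as a
// summary of the switch statement that follows:
//
//    inactive   -> rekeying  (only when rekey is set)
//    active     -> rekeying  (when rekey is set) or inactive (otherwise)
//    rekeying   -> unchanged
//    deprecated -> unchanged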
if isRotation { iter, err := txn.Get(TableRootKeyMeta, indexID) if err != nil { return err } for { raw := iter.Next() if raw == nil { break } key := raw.(*structs.RootKeyMeta) modified := false switch key.State { case structs.RootKeyStateInactive: if rekey { key.SetRekeying() modified = true } case structs.RootKeyStateActive: if rekey { key.SetRekeying() } else { key.SetInactive() } modified = true case structs.RootKeyStateRekeying, structs.RootKeyStateDeprecated: // nothing to do } if modified { key.ModifyIndex = index if err := txn.Insert(TableRootKeyMeta, key); err != nil { return err } } } } if err := txn.Insert(TableRootKeyMeta, rootKeyMeta); err != nil { return err } // update the indexes table if err := txn.Insert("index", &IndexEntry{TableRootKeyMeta, index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return txn.Commit() } // DeleteRootKeyMeta deletes a single root key, or returns an error if // it doesn't exist. func (s *StateStore) DeleteRootKeyMeta(index uint64, keyID string) error { txn := s.db.WriteTxn(index) defer txn.Abort() // find the old key existing, err := txn.First(TableRootKeyMeta, indexID, keyID) if err != nil { return fmt.Errorf("root key metadata lookup failed: %v", err) } if existing == nil { return fmt.Errorf("root key metadata not found") } if err := txn.Delete(TableRootKeyMeta, existing); err != nil { return fmt.Errorf("root key metadata delete failed: %v", err) } // update the indexes table if err := txn.Insert("index", &IndexEntry{TableRootKeyMeta, index}); err != nil { return fmt.Errorf("index update failed: %v", err) } return txn.Commit() } // RootKeyMetas returns an iterator over all root key metadata func (s *StateStore) RootKeyMetas(ws memdb.WatchSet) (memdb.ResultIterator, error) { txn := s.db.ReadTxn() iter, err := txn.Get(TableRootKeyMeta, indexID) if err != nil { return nil, err } ws.Add(iter.WatchCh()) return iter, nil } // RootKeyMetaByID returns a specific root key meta func (s *StateStore) RootKeyMetaByID(ws memdb.WatchSet, id string) (*structs.RootKeyMeta, error) { txn := s.db.ReadTxn() watchCh, raw, err := txn.FirstWatch(TableRootKeyMeta, indexID, id) if err != nil { return nil, fmt.Errorf("root key metadata lookup failed: %v", err) } ws.Add(watchCh) if raw != nil { return raw.(*structs.RootKeyMeta), nil } return nil, nil } // GetActiveRootKeyMeta returns the metadata for the currently active root key func (s *StateStore) GetActiveRootKeyMeta(ws memdb.WatchSet) (*structs.RootKeyMeta, error) { txn := s.db.ReadTxn() iter, err := txn.Get(TableRootKeyMeta, indexID) if err != nil { return nil, err } ws.Add(iter.WatchCh()) for { raw := iter.Next() if raw == nil { break } key := raw.(*structs.RootKeyMeta) if key.Active() { return key, nil } } return nil, nil }
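// A hypothetical reader sketch (illustrative only; store is assumed to be a
// *StateStore and ctx a context.Context): combining NewWatchSet with
// GetActiveRootKeyMeta lets a caller block until root key metadata changes or
// the state store is abandoned, then re-read the active key:
//
//    ws := store.NewWatchSet()
//    key, err := store.GetActiveRootKeyMeta(ws)
//    if err != nil {
//        return err
//    }
//    // ... use key ...
//    if err := ws.WatchCtx(ctx); err != nil {
//        return err // context cancelled before anything changed
//    }
//    // a root key was inserted, updated, or deleted (or the store was
//    // abandoned); call GetActiveRootKeyMeta again with a fresh watch set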