2015-08-22 00:23:01 +00:00
|
|
|
package state
|
|
|
|
|
|
|
|
import (
|
2015-08-25 03:51:07 +00:00
|
|
|
"errors"
|
2015-08-22 00:23:01 +00:00
|
|
|
"fmt"
|
|
|
|
|
2016-06-06 20:19:31 +00:00
|
|
|
"github.com/hashicorp/consul/types"
|
2015-08-22 00:23:01 +00:00
|
|
|
"github.com/hashicorp/go-memdb"
|
|
|
|
)
|
|
|
|
|
2015-08-25 03:51:07 +00:00
|
|
|
// Sentinel errors returned by state store operations when a required
// registration or identifier is absent.
var (
	// ErrMissingNode reports that an operation needed a node registration
	// and none exists.
	ErrMissingNode = errors.New("Missing node registration")

	// ErrMissingService reports that an operation needed a service
	// registration and none exists.
	ErrMissingService = errors.New("Missing service registration")

	// ErrMissingSessionID reports an attempt to register a session whose
	// session ID is empty.
	ErrMissingSessionID = errors.New("Missing session ID")

	// ErrMissingACLID reports an ACL set that was called on an ACL whose
	// ID is empty.
	ErrMissingACLID = errors.New("Missing ACL ID")

	// ErrMissingQueryID reports a Query set that was called on a Query
	// whose ID is empty.
	ErrMissingQueryID = errors.New("Missing Query ID")
)
|
|
|
|
|
2017-01-20 07:36:50 +00:00
|
|
|
// watchLimit is a soft cap on the number of watches allowed for a single
// blocking query. Once it is exceeded, a higher-level, less fine-grained
// watch is used instead. That is less costly than it sounds because the
// main culprits (nodes and services) are diffed before being updated via
// register requests.
//
// With aFew == 32 in memdb's watch_few.go, this value permits up to ~64
// goroutines per blocking query.
const watchLimit = 2048
|
|
|
|
|
2015-08-22 19:44:33 +00:00
|
|
|
// StateStore is where we store all of Consul's state, including
|
|
|
|
// records of node registrations, services, checks, key/value
|
|
|
|
// pairs and more. The DB is entirely in-memory and is constructed
|
|
|
|
// from the Raft log through the FSM.
|
2015-08-22 00:23:01 +00:00
|
|
|
type StateStore struct {
|
2015-09-25 19:01:46 +00:00
|
|
|
schema *memdb.DBSchema
|
|
|
|
db *memdb.MemDB
|
|
|
|
|
2017-01-24 18:38:03 +00:00
|
|
|
// abandonCh is used to signal watchers that this state store has been
|
|
|
|
// abandoned (usually during a restore). This is only ever closed.
|
|
|
|
abandonCh chan struct{}
|
|
|
|
|
2015-09-25 19:01:46 +00:00
|
|
|
// tableWatches holds all the full table watches, indexed by table name.
|
|
|
|
tableWatches map[string]*FullTableWatch
|
|
|
|
|
|
|
|
// kvsWatch holds the special prefix watch for the key value store.
|
2016-01-20 05:46:22 +00:00
|
|
|
kvsWatch *PrefixWatchManager
|
2015-09-25 19:01:46 +00:00
|
|
|
|
|
|
|
// kvsGraveyard manages tombstones for the key value store.
|
|
|
|
kvsGraveyard *Graveyard
|
|
|
|
|
|
|
|
// lockDelay holds expiration times for locks associated with keys.
|
|
|
|
lockDelay *Delay
|
2015-09-20 08:36:39 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// StateSnapshot is used to provide a point-in-time snapshot. It
|
|
|
|
// works by starting a read transaction against the whole state store.
|
|
|
|
type StateSnapshot struct {
|
2015-09-25 19:01:46 +00:00
|
|
|
store *StateStore
|
2015-09-20 08:36:39 +00:00
|
|
|
tx *memdb.Txn
|
|
|
|
lastIndex uint64
|
2015-08-22 00:23:01 +00:00
|
|
|
}
|
|
|
|
|
2015-10-20 06:06:59 +00:00
|
|
|
// StateRestore is used to efficiently manage restoring a large amount of
|
|
|
|
// data to a state store.
|
|
|
|
type StateRestore struct {
|
|
|
|
store *StateStore
|
|
|
|
tx *memdb.Txn
|
|
|
|
watches *DumbWatchManager
|
|
|
|
}
|
|
|
|
|
2015-08-22 19:44:33 +00:00
|
|
|
// IndexEntry records the last index seen for a single table.
//
// The field order is relied upon by positional struct literals; do not
// reorder.
type IndexEntry struct {
	// Key is the name of the table the entry belongs to.
	Key string

	// Value is the last index recorded for that table.
	Value uint64
}
|
|
|
|
|
2015-09-04 02:11:12 +00:00
|
|
|
// sessionCheck is used to create a many-to-one table such that
|
|
|
|
// each check registered by a session can be mapped back to the
|
|
|
|
// session table. This is only used internally in the state
|
|
|
|
// store and thus it is not exported.
|
|
|
|
type sessionCheck struct {
|
|
|
|
Node string
|
2016-06-07 20:24:51 +00:00
|
|
|
CheckID types.CheckID
|
2015-09-04 02:11:12 +00:00
|
|
|
Session string
|
|
|
|
}
|
|
|
|
|
2015-08-22 19:44:33 +00:00
|
|
|
// NewStateStore creates a new in-memory state storage layer.
|
2015-10-12 07:42:09 +00:00
|
|
|
func NewStateStore(gc *TombstoneGC) (*StateStore, error) {
|
2015-09-20 08:36:39 +00:00
|
|
|
// Create the in-memory DB.
|
|
|
|
schema := stateStoreSchema()
|
|
|
|
db, err := memdb.NewMemDB(schema)
|
2015-08-22 00:23:01 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("Failed setting up state store: %s", err)
|
|
|
|
}
|
|
|
|
|
2015-09-25 19:01:46 +00:00
|
|
|
// Build up the all-table watches.
|
|
|
|
tableWatches := make(map[string]*FullTableWatch)
|
|
|
|
for table, _ := range schema.Tables {
|
2015-10-09 02:24:50 +00:00
|
|
|
if table == "kvs" || table == "tombstones" {
|
2015-09-25 19:01:46 +00:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
tableWatches[table] = NewFullTableWatch()
|
2015-09-20 08:36:39 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Create and return the state store.
|
2015-08-22 00:23:01 +00:00
|
|
|
s := &StateStore{
|
2015-09-25 19:01:46 +00:00
|
|
|
schema: schema,
|
|
|
|
db: db,
|
2017-01-24 18:38:03 +00:00
|
|
|
abandonCh: make(chan struct{}),
|
2015-09-25 19:01:46 +00:00
|
|
|
tableWatches: tableWatches,
|
2016-01-20 05:46:22 +00:00
|
|
|
kvsWatch: NewPrefixWatchManager(),
|
2015-10-12 07:42:09 +00:00
|
|
|
kvsGraveyard: NewGraveyard(gc),
|
2015-09-25 19:01:46 +00:00
|
|
|
lockDelay: NewDelay(),
|
2015-08-22 00:23:01 +00:00
|
|
|
}
|
|
|
|
return s, nil
|
|
|
|
}
|
2015-08-22 19:44:33 +00:00
|
|
|
|
2015-09-20 08:36:39 +00:00
|
|
|
// Snapshot is used to create a point-in-time snapshot of the entire db.
|
|
|
|
func (s *StateStore) Snapshot() *StateSnapshot {
|
|
|
|
tx := s.db.Txn(false)
|
|
|
|
|
|
|
|
var tables []string
|
|
|
|
for table, _ := range s.schema.Tables {
|
|
|
|
tables = append(tables, table)
|
|
|
|
}
|
|
|
|
idx := maxIndexTxn(tx, tables...)
|
|
|
|
|
2015-09-25 19:01:46 +00:00
|
|
|
return &StateSnapshot{s, tx, idx}
|
2015-09-20 08:36:39 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// LastIndex returns that last index that affects the snapshotted data.
|
|
|
|
func (s *StateSnapshot) LastIndex() uint64 {
|
|
|
|
return s.lastIndex
|
|
|
|
}
|
|
|
|
|
|
|
|
// Close performs cleanup of a state snapshot.
|
|
|
|
func (s *StateSnapshot) Close() {
|
|
|
|
s.tx.Abort()
|
|
|
|
}
|
|
|
|
|
2015-10-20 06:06:59 +00:00
|
|
|
// Restore is used to efficiently manage restoring a large amount of data into
|
|
|
|
// the state store. It works by doing all the restores inside of a single
|
|
|
|
// transaction.
|
|
|
|
func (s *StateStore) Restore() *StateRestore {
|
|
|
|
tx := s.db.Txn(true)
|
|
|
|
watches := NewDumbWatchManager(s.tableWatches)
|
|
|
|
return &StateRestore{s, tx, watches}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Abort abandons the changes made by a restore. This or Commit should always be
|
|
|
|
// called.
|
|
|
|
func (s *StateRestore) Abort() {
|
|
|
|
s.tx.Abort()
|
|
|
|
}
|
|
|
|
|
|
|
|
// Commit commits the changes made by a restore. This or Abort should always be
|
|
|
|
// called.
|
|
|
|
func (s *StateRestore) Commit() {
|
|
|
|
// Fire off a single KVS watch instead of a zillion prefix ones, and use
|
|
|
|
// a dumb watch manager to single-fire all the full table watches.
|
|
|
|
s.tx.Defer(func() { s.store.kvsWatch.Notify("", true) })
|
|
|
|
s.tx.Defer(func() { s.watches.Notify() })
|
|
|
|
|
|
|
|
s.tx.Commit()
|
|
|
|
}
|
|
|
|
|
2017-01-24 18:38:03 +00:00
|
|
|
// AbandonCh returns a channel you can wait on to know if the state store was
|
|
|
|
// abandoned.
|
|
|
|
func (s *StateStore) AbandonCh() <-chan struct{} {
|
|
|
|
return s.abandonCh
|
|
|
|
}
|
|
|
|
|
|
|
|
// Abandon is used to signal that the given state store has been abandoned.
|
|
|
|
// Calling this more than one time will panic.
|
|
|
|
func (s *StateStore) Abandon() {
|
|
|
|
close(s.abandonCh)
|
|
|
|
}
|
|
|
|
|
2015-08-25 02:03:28 +00:00
|
|
|
// maxIndex is a helper used to retrieve the highest known index
|
|
|
|
// amongst a set of tables in the db.
|
|
|
|
func (s *StateStore) maxIndex(tables ...string) uint64 {
|
|
|
|
tx := s.db.Txn(false)
|
|
|
|
defer tx.Abort()
|
2015-09-20 08:36:39 +00:00
|
|
|
return maxIndexTxn(tx, tables...)
|
|
|
|
}
|
2015-08-25 02:03:28 +00:00
|
|
|
|
2015-09-20 08:36:39 +00:00
|
|
|
// maxIndexTxn is a helper used to retrieve the highest known index
|
|
|
|
// amongst a set of tables in the db.
|
|
|
|
func maxIndexTxn(tx *memdb.Txn, tables ...string) uint64 {
|
2015-08-25 02:03:28 +00:00
|
|
|
var lindex uint64
|
|
|
|
for _, table := range tables {
|
|
|
|
ti, err := tx.First("index", "id", table)
|
|
|
|
if err != nil {
|
2015-09-25 19:01:46 +00:00
|
|
|
panic(fmt.Sprintf("unknown index: %s err: %s", table, err))
|
2015-08-25 02:03:28 +00:00
|
|
|
}
|
2015-09-02 20:32:12 +00:00
|
|
|
if idx, ok := ti.(*IndexEntry); ok && idx.Value > lindex {
|
|
|
|
lindex = idx.Value
|
2015-08-25 02:03:28 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return lindex
|
|
|
|
}
|
|
|
|
|
2015-09-20 08:36:39 +00:00
|
|
|
// indexUpdateMaxTxn is used when restoring entries and sets the table's index to
|
|
|
|
// the given idx only if it's greater than the current index.
|
|
|
|
func indexUpdateMaxTxn(tx *memdb.Txn, idx uint64, table string) error {
|
2015-09-25 19:01:46 +00:00
|
|
|
ti, err := tx.First("index", "id", table)
|
2015-09-20 08:36:39 +00:00
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("failed to retrieve existing index: %s", err)
|
|
|
|
}
|
|
|
|
|
2015-10-09 06:28:32 +00:00
|
|
|
// Always take the first update, otherwise do the > check.
|
|
|
|
if ti == nil {
|
|
|
|
if err := tx.Insert("index", &IndexEntry{table, idx}); err != nil {
|
|
|
|
return fmt.Errorf("failed updating index %s", err)
|
|
|
|
}
|
|
|
|
} else if cur, ok := ti.(*IndexEntry); ok && idx > cur.Value {
|
2015-09-20 08:36:39 +00:00
|
|
|
if err := tx.Insert("index", &IndexEntry{table, idx}); err != nil {
|
|
|
|
return fmt.Errorf("failed updating index %s", err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2015-10-14 02:18:43 +00:00
|
|
|
// getWatchTables returns the list of tables that should be watched and used for
|
|
|
|
// max index calculations for the given query method. This is used for all
|
|
|
|
// methods except for KVS. This will panic if the method is unknown.
|
|
|
|
func (s *StateStore) getWatchTables(method string) []string {
|
|
|
|
switch method {
|
|
|
|
case "GetNode", "Nodes":
|
|
|
|
return []string{"nodes"}
|
|
|
|
case "Services":
|
|
|
|
return []string{"services"}
|
2016-12-10 03:15:44 +00:00
|
|
|
case "NodeService", "NodeServices", "ServiceNodes":
|
2015-10-14 02:18:43 +00:00
|
|
|
return []string{"nodes", "services"}
|
2016-12-10 03:15:44 +00:00
|
|
|
case "NodeCheck", "NodeChecks", "ServiceChecks", "ChecksInState":
|
2015-10-14 02:18:43 +00:00
|
|
|
return []string{"checks"}
|
2017-01-14 01:08:43 +00:00
|
|
|
case "ChecksInStateByNodeMeta", "ServiceChecksByNodeMeta":
|
|
|
|
return []string{"nodes", "checks"}
|
2015-10-14 02:18:43 +00:00
|
|
|
case "CheckServiceNodes", "NodeInfo", "NodeDump":
|
|
|
|
return []string{"nodes", "services", "checks"}
|
|
|
|
case "SessionGet", "SessionList", "NodeSessions":
|
|
|
|
return []string{"sessions"}
|
|
|
|
case "ACLGet", "ACLList":
|
|
|
|
return []string{"acls"}
|
2015-10-23 22:19:14 +00:00
|
|
|
case "Coordinates":
|
|
|
|
return []string{"coordinates"}
|
2016-02-26 20:07:43 +00:00
|
|
|
case "PreparedQueryGet", "PreparedQueryResolve", "PreparedQueryList":
|
2015-11-10 04:37:41 +00:00
|
|
|
return []string{"prepared-queries"}
|
2015-10-14 02:18:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
panic(fmt.Sprintf("Unknown method %s", method))
|
|
|
|
}
|
|
|
|
|
2015-10-13 03:12:13 +00:00
|
|
|
// getTableWatch returns a full table watch for the given table. This will panic
|
|
|
|
// if the table doesn't have a full table watch.
|
|
|
|
func (s *StateStore) getTableWatch(table string) Watch {
|
2015-09-25 19:01:46 +00:00
|
|
|
if watch, ok := s.tableWatches[table]; ok {
|
|
|
|
return watch
|
|
|
|
}
|
|
|
|
|
2015-10-13 06:01:21 +00:00
|
|
|
panic(fmt.Sprintf("Unknown watch for table %s", table))
|
2015-09-25 19:01:46 +00:00
|
|
|
}
|
|
|
|
|
2015-10-13 03:12:13 +00:00
|
|
|
// GetQueryWatch returns a watch for the given query method. This is
|
|
|
|
// used for all methods except for KV; you should call GetKVSWatch instead.
|
2015-10-14 02:18:43 +00:00
|
|
|
// This will panic if the method is unknown.
|
2015-10-13 03:12:13 +00:00
|
|
|
func (s *StateStore) GetQueryWatch(method string) Watch {
|
2015-10-14 02:18:43 +00:00
|
|
|
tables := s.getWatchTables(method)
|
|
|
|
if len(tables) == 1 {
|
|
|
|
return s.getTableWatch(tables[0])
|
2015-10-13 03:12:13 +00:00
|
|
|
}
|
|
|
|
|
2015-10-14 02:18:43 +00:00
|
|
|
var watches []Watch
|
|
|
|
for _, table := range tables {
|
|
|
|
watches = append(watches, s.getTableWatch(table))
|
|
|
|
}
|
|
|
|
return NewMultiWatch(watches...)
|
2015-10-13 03:12:13 +00:00
|
|
|
}
|
|
|
|
|
2015-09-25 19:01:46 +00:00
|
|
|
// GetKVSWatch returns a watch for the given prefix in the key value store.
|
|
|
|
func (s *StateStore) GetKVSWatch(prefix string) Watch {
|
2016-01-20 05:46:22 +00:00
|
|
|
return s.kvsWatch.NewPrefixWatch(prefix)
|
2015-09-25 19:01:46 +00:00
|
|
|
}
|