open-consul/agent/consul/acl_replication_legacy.go
Matt Keeler 0c76a4389f
ACL Token Persistence and Reloading (#5328)
This PR adds two features which will be useful for operators when ACLs are in use.

1. Tokens set in configuration files are now reloadable.
2. If `acl.enable_token_persistence` is set to `true` in the configuration, tokens set via the `v1/agent/token` endpoint are now persisted to disk and loaded when the agent starts (or during configuration reload)

Note that token persistence is opt-in so our users who do not want tokens on the local disk will see no change.

Some other secondary changes:

* Refactored a bunch of places where the replication token is retrieved from the token store. This token isn't just for replicating ACLs and now it is named accordingly.
* Allowed better paths in the `v1/agent/token/` API. Instead of paths like: `v1/agent/token/acl_replication_token` the path can now be just `v1/agent/token/replication`. The old paths remain to be valid. 
* Added a couple new API functions to set tokens via the new paths. Deprecated the old ones and pointed to the new names. The names are also generally better and don't imply that what you are setting is for ACLs but rather are setting ACL tokens. There is a minor semantic difference there especially for the replication token as again, its no longer used only for ACL token/policy replication. The new functions will detect 404s and fallback to using the older token paths when talking to pre-1.4.3 agents.
* Docs updated to reflect the API additions and to show using the new endpoints.
* Updated the ACL CLI set-agent-tokens command to use the non-deprecated APIs.
2019-02-27 14:28:31 -05:00

266 lines
8.6 KiB
Go

package consul
import (
"context"
"fmt"
"sort"
"time"
"github.com/armon/go-metrics"
"github.com/hashicorp/consul/agent/structs"
)
// aclIterator simplifies the algorithm below by providing a basic iterator that
// moves through a list of ACLs and returns nil when it's exhausted. It also has
// methods for pre-sorting the ACLs being iterated over by ID, which should
// already be true, but since this is crucial for correctness and we are taking
// input from other servers, we sort to make sure.
type aclIterator struct {
acls structs.ACLs
// index is the current position of the iterator.
index int
}
// newACLIterator returns a new ACL iterator.
func newACLIterator(acls structs.ACLs) *aclIterator {
return &aclIterator{acls: acls}
}
// See sort.Interface.
func (a *aclIterator) Len() int {
return len(a.acls)
}
// See sort.Interface.
func (a *aclIterator) Swap(i, j int) {
a.acls[i], a.acls[j] = a.acls[j], a.acls[i]
}
// See sort.Interface.
func (a *aclIterator) Less(i, j int) bool {
return a.acls[i].ID < a.acls[j].ID
}
// Front returns the item at index position, or nil if the list is exhausted.
func (a *aclIterator) Front() *structs.ACL {
if a.index < len(a.acls) {
return a.acls[a.index]
}
return nil
}
// Next advances the iterator to the next index.
func (a *aclIterator) Next() {
a.index++
}
// reconcileACLs takes the local and remote ACL state, and produces a list of
// changes required in order to bring the local ACLs into sync with the remote
// ACLs. You can supply lastRemoteIndex as a hint that replication has succeeded
// up to that remote index and it will make this process more efficient by only
// comparing ACL entries modified after that index. Setting this to 0 will force
// a full compare of all existing ACLs.
func reconcileLegacyACLs(local, remote structs.ACLs, lastRemoteIndex uint64) structs.ACLRequests {
// Since sorting the lists is crucial for correctness, we are depending
// on data coming from other servers potentially running a different,
// version of Consul, and sorted-ness is kind of a subtle property of
// the state store indexing, it's prudent to make sure things are sorted
// before we begin.
localIter, remoteIter := newACLIterator(local), newACLIterator(remote)
sort.Sort(localIter)
sort.Sort(remoteIter)
// Run through both lists and reconcile them.
var changes structs.ACLRequests
for localIter.Front() != nil || remoteIter.Front() != nil {
// If the local list is exhausted, then process this as a remote
// add. We know from the loop condition that there's something
// in the remote list.
if localIter.Front() == nil {
changes = append(changes, &structs.ACLRequest{
Op: structs.ACLSet,
ACL: *(remoteIter.Front()),
})
remoteIter.Next()
continue
}
// If the remote list is exhausted, then process this as a local
// delete. We know from the loop condition that there's something
// in the local list.
if remoteIter.Front() == nil {
changes = append(changes, &structs.ACLRequest{
Op: structs.ACLDelete,
ACL: *(localIter.Front()),
})
localIter.Next()
continue
}
// At this point we know there's something at the front of each
// list we need to resolve.
// If the remote list has something local doesn't, we add it.
if localIter.Front().ID > remoteIter.Front().ID {
changes = append(changes, &structs.ACLRequest{
Op: structs.ACLSet,
ACL: *(remoteIter.Front()),
})
remoteIter.Next()
continue
}
// If local has something remote doesn't, we delete it.
if localIter.Front().ID < remoteIter.Front().ID {
changes = append(changes, &structs.ACLRequest{
Op: structs.ACLDelete,
ACL: *(localIter.Front()),
})
localIter.Next()
continue
}
// Local and remote have an ACL with the same ID, so we might
// need to compare them.
l, r := localIter.Front(), remoteIter.Front()
if r.RaftIndex.ModifyIndex > lastRemoteIndex && !r.IsSame(l) {
changes = append(changes, &structs.ACLRequest{
Op: structs.ACLSet,
ACL: *r,
})
}
localIter.Next()
remoteIter.Next()
}
return changes
}
// FetchLocalACLs returns the ACLs in the local state store.
func (s *Server) fetchLocalLegacyACLs() (structs.ACLs, error) {
_, local, err := s.fsm.State().ACLTokenList(nil, false, true, "")
if err != nil {
return nil, err
}
var acls structs.ACLs
for _, token := range local {
if acl, err := token.Convert(); err == nil && acl != nil {
acls = append(acls, acl)
}
}
return acls, nil
}
// FetchRemoteACLs is used to get the remote set of ACLs from the ACL
// datacenter. The lastIndex parameter is a hint about which remote index we
// have replicated to, so this is expected to block until something changes.
func (s *Server) fetchRemoteLegacyACLs(lastRemoteIndex uint64) (*structs.IndexedACLs, error) {
defer metrics.MeasureSince([]string{"leader", "fetchRemoteACLs"}, time.Now())
args := structs.DCSpecificRequest{
Datacenter: s.config.ACLDatacenter,
QueryOptions: structs.QueryOptions{
Token: s.tokens.ReplicationToken(),
MinQueryIndex: lastRemoteIndex,
AllowStale: true,
},
}
var remote structs.IndexedACLs
if err := s.RPC("ACL.List", &args, &remote); err != nil {
return nil, err
}
return &remote, nil
}
// UpdateLocalACLs is given a list of changes to apply in order to bring the
// local ACLs in-line with the remote ACLs from the ACL datacenter.
func (s *Server) updateLocalLegacyACLs(changes structs.ACLRequests, ctx context.Context) (bool, error) {
defer metrics.MeasureSince([]string{"leader", "updateLocalACLs"}, time.Now())
minTimePerOp := time.Second / time.Duration(s.config.ACLReplicationApplyLimit)
for _, change := range changes {
// Note that we are using the single ACL interface here and not
// performing all this inside a single transaction. This is OK
// for two reasons. First, there's nothing else other than this
// replication routine that alters the local ACLs, so there's
// nothing to contend with locally. Second, if an apply fails
// in the middle (most likely due to losing leadership), the
// next replication pass will clean up and check everything
// again.
var reply string
start := time.Now()
if err := aclApplyInternal(s, change, &reply); err != nil {
return false, err
}
// Do a smooth rate limit to wait out the min time allowed for
// each op. If this op took longer than the min, then the sleep
// time will be negative and we will just move on.
elapsed := time.Since(start)
select {
case <-ctx.Done():
return true, nil
case <-time.After(minTimePerOp - elapsed):
// do nothing
}
}
return false, nil
}
// replicateACLs is a runs one pass of the algorithm for replicating ACLs from
// a remote ACL datacenter to local state. If there's any error, this will return
// 0 for the lastRemoteIndex, which will cause us to immediately do a full sync
// next time.
func (s *Server) replicateLegacyACLs(lastRemoteIndex uint64, ctx context.Context) (uint64, bool, error) {
remote, err := s.fetchRemoteLegacyACLs(lastRemoteIndex)
if err != nil {
return 0, false, fmt.Errorf("failed to retrieve remote ACLs: %v", err)
}
// Need to check if we should be stopping. This will be common as the fetching process is a blocking
// RPC which could have been hanging around for a long time and during that time leadership could
// have been lost.
select {
case <-ctx.Done():
return 0, true, nil
default:
// do nothing
}
// Measure everything after the remote query, which can block for long
// periods of time. This metric is a good measure of how expensive the
// replication process is.
defer metrics.MeasureSince([]string{"leader", "replicateACLs"}, time.Now())
local, err := s.fetchLocalLegacyACLs()
if err != nil {
return 0, false, fmt.Errorf("failed to retrieve local ACLs: %v", err)
}
// If the remote index ever goes backwards, it's a good indication that
// the remote side was rebuilt and we should do a full sync since we
// can't make any assumptions about what's going on.
if remote.QueryMeta.Index < lastRemoteIndex {
s.logger.Printf("[WARN] consul: Legacy ACL replication remote index moved backwards (%d to %d), forcing a full ACL sync", lastRemoteIndex, remote.QueryMeta.Index)
lastRemoteIndex = 0
}
// Calculate the changes required to bring the state into sync and then
// apply them.
changes := reconcileLegacyACLs(local, remote.ACLs, lastRemoteIndex)
exit, err := s.updateLocalLegacyACLs(changes, ctx)
if exit {
return 0, true, nil
}
if err != nil {
return 0, false, fmt.Errorf("failed to sync ACL changes: %v", err)
}
// Return the index we got back from the remote side, since we've synced
// up with the remote state as of that index.
return remote.QueryMeta.Index, false, nil
}