open-consul/agent/acl.go
Frank Schröder 44e6b8122d acl: consolidate error handling (#3401)
The error handling of the ACL code relies on the presence of certain
magic error messages. Since the error values are sent via RPC between
older and newer consul agents we cannot just replace the magic values
with typed errors and switch to type checks since this would break
compatibility with older clients.

Therefore, this patch moves all magic ACL error messages into the acl
package and provides default error values and helper functions which
determine the type of error.
2017-08-23 16:52:48 +02:00

439 lines
12 KiB
Go

package agent
import (
"fmt"
"sync"
"time"
"github.com/armon/go-metrics"
"github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/types"
"github.com/hashicorp/golang-lru"
"github.com/hashicorp/serf/serf"
)
// There's enough behavior difference with client-side ACLs that we've
// intentionally kept this code separate from the server-side ACL code in
// consul/acl.go. We may refactor some of the caching logic in the future,
// but for now we are developing this separately to see how things shake out.
const (
// anonymousToken is the token ID we re-write to if there is no token ID
// provided.
anonymousToken = "anonymous"
// Maximum number of cached ACL entries.
aclCacheSize = 10 * 1024
)
// aclCacheEntry is used to cache ACL tokens.
type aclCacheEntry struct {
// ACL is the cached ACL.
ACL acl.ACL
// Expires is set based on the TTL for the ACL.
Expires time.Time
// ETag is used as an optimization when fetching ACLs from servers to
// avoid transmitting data back when the agent has a good copy, which is
// usually the case when refreshing a TTL.
ETag string
}
// aclManager is used by the agent to keep track of state related to ACLs,
// including caching tokens from the servers. This has some internal state that
// we don't want to dump into the agent itself.
type aclManager struct {
// acls is a cache mapping ACL tokens to compiled policies.
acls *lru.TwoQueueCache
// master is the ACL to use when the agent master token is supplied.
master acl.ACL
// down is the ACL to use when the servers are down. This may be nil
// which means to try and use the cached policy if there is one (or
// deny if there isn't a policy in the cache).
down acl.ACL
// disabled is used to keep track of feedback from the servers that ACLs
// are disabled. If the manager discovers that ACLs are disabled, this
// will be set to the next time we should check to see if they have been
// enabled. This helps cut useless traffic, but allows us to turn on ACL
// support at the servers without having to restart the whole cluster.
disabled time.Time
disabledLock sync.RWMutex
}
// newACLManager returns an ACL manager based on the given config.
func newACLManager(config *Config) (*aclManager, error) {
// Set up the cache from ID to ACL (we don't cache policies like the
// servers; only one level).
acls, err := lru.New2Q(aclCacheSize)
if err != nil {
return nil, err
}
// Build a policy for the agent master token.
policy := &acl.Policy{
Agents: []*acl.AgentPolicy{
&acl.AgentPolicy{
Node: config.NodeName,
Policy: acl.PolicyWrite,
},
},
Nodes: []*acl.NodePolicy{
&acl.NodePolicy{
Name: "",
Policy: acl.PolicyRead,
},
},
}
master, err := acl.New(acl.DenyAll(), policy)
if err != nil {
return nil, err
}
var down acl.ACL
switch config.ACLDownPolicy {
case "allow":
down = acl.AllowAll()
case "deny":
down = acl.DenyAll()
case "extend-cache":
// Leave the down policy as nil to signal this.
default:
return nil, fmt.Errorf("invalid ACL down policy %q", config.ACLDownPolicy)
}
// Give back a manager.
return &aclManager{
acls: acls,
master: master,
down: down,
}, nil
}
// isDisabled returns true if the manager has discovered that ACLs are disabled
// on the servers.
func (m *aclManager) isDisabled() bool {
m.disabledLock.RLock()
defer m.disabledLock.RUnlock()
return time.Now().Before(m.disabled)
}
// lookupACL attempts to locate the compiled policy associated with the given
// token. The agent may be used to perform RPC calls to the servers to fetch
// policies that aren't in the cache.
func (m *aclManager) lookupACL(a *Agent, id string) (acl.ACL, error) {
// Handle some special cases for the ID.
if len(id) == 0 {
id = anonymousToken
} else if acl.RootACL(id) != nil {
return nil, acl.ErrRootDenied
} else if a.tokens.IsAgentMasterToken(id) {
return m.master, nil
}
// Try the cache first.
var cached *aclCacheEntry
if raw, ok := m.acls.Get(id); ok {
cached = raw.(*aclCacheEntry)
}
if cached != nil && time.Now().Before(cached.Expires) {
metrics.IncrCounter([]string{"consul", "acl", "cache_hit"}, 1)
return cached.ACL, nil
}
metrics.IncrCounter([]string{"consul", "acl", "cache_miss"}, 1)
// At this point we might have a stale cached ACL, or none at all, so
// try to contact the servers.
args := structs.ACLPolicyRequest{
Datacenter: a.config.ACLDatacenter,
ACL: id,
}
if cached != nil {
args.ETag = cached.ETag
}
var reply structs.ACLPolicy
err := a.RPC("ACL.GetPolicy", &args, &reply)
if err != nil {
if acl.IsErrDisabled(err) {
a.logger.Printf("[DEBUG] agent: ACLs disabled on servers, will check again after %s", a.config.ACLDisabledTTL)
m.disabledLock.Lock()
m.disabled = time.Now().Add(a.config.ACLDisabledTTL)
m.disabledLock.Unlock()
return nil, nil
} else if acl.IsErrNotFound(err) {
return nil, acl.ErrNotFound
} else {
a.logger.Printf("[DEBUG] agent: Failed to get policy for ACL from servers: %v", err)
if m.down != nil {
return m.down, nil
} else if cached != nil {
return cached.ACL, nil
} else {
return acl.DenyAll(), nil
}
}
}
// Use the old cached compiled ACL if we can, otherwise compile it and
// resolve any parents.
var compiled acl.ACL
if cached != nil && cached.ETag == reply.ETag {
compiled = cached.ACL
} else {
parent := acl.RootACL(reply.Parent)
if parent == nil {
parent, err = m.lookupACL(a, reply.Parent)
if err != nil {
return nil, err
}
}
acl, err := acl.New(parent, reply.Policy)
if err != nil {
return nil, err
}
compiled = acl
}
// Update the cache.
cached = &aclCacheEntry{
ACL: compiled,
ETag: reply.ETag,
}
if reply.TTL > 0 {
cached.Expires = time.Now().Add(reply.TTL)
}
m.acls.Add(id, cached)
return compiled, nil
}
// resolveToken is the primary interface used by ACL-checkers in the agent
// endpoints, which is the one place where we do some ACL enforcement on
// clients. Some of the enforcement is normative (e.g. self and monitor)
// and some is informative (e.g. catalog and health).
func (a *Agent) resolveToken(id string) (acl.ACL, error) {
// Disable ACLs if version 8 enforcement isn't enabled.
if !(*a.config.ACLEnforceVersion8) {
return nil, nil
}
// Bail if there's no ACL datacenter configured. This means that agent
// enforcement isn't on.
if a.config.ACLDatacenter == "" {
return nil, nil
}
// Bail if the ACL manager is disabled. This happens if it gets feedback
// from the servers that ACLs are disabled.
if a.acls.isDisabled() {
return nil, nil
}
// This will look in the cache and fetch from the servers if necessary.
return a.acls.lookupACL(a, id)
}
// vetServiceRegister makes sure the service registration action is allowed by
// the given token.
func (a *Agent) vetServiceRegister(token string, service *structs.NodeService) error {
// Resolve the token and bail if ACLs aren't enabled.
rule, err := a.resolveToken(token)
if err != nil {
return err
}
if rule == nil {
return nil
}
// Vet the service itself.
if !rule.ServiceWrite(service.Service) {
return acl.ErrPermissionDenied
}
// Vet any service that might be getting overwritten.
services := a.state.Services()
if existing, ok := services[service.ID]; ok {
if !rule.ServiceWrite(existing.Service) {
return acl.ErrPermissionDenied
}
}
return nil
}
// vetServiceUpdate makes sure the service update action is allowed by the given
// token.
func (a *Agent) vetServiceUpdate(token string, serviceID string) error {
// Resolve the token and bail if ACLs aren't enabled.
rule, err := a.resolveToken(token)
if err != nil {
return err
}
if rule == nil {
return nil
}
// Vet any changes based on the existing services's info.
services := a.state.Services()
if existing, ok := services[serviceID]; ok {
if !rule.ServiceWrite(existing.Service) {
return acl.ErrPermissionDenied
}
} else {
return fmt.Errorf("Unknown service %q", serviceID)
}
return nil
}
// vetCheckRegister makes sure the check registration action is allowed by the
// given token.
func (a *Agent) vetCheckRegister(token string, check *structs.HealthCheck) error {
// Resolve the token and bail if ACLs aren't enabled.
rule, err := a.resolveToken(token)
if err != nil {
return err
}
if rule == nil {
return nil
}
// Vet the check itself.
if len(check.ServiceName) > 0 {
if !rule.ServiceWrite(check.ServiceName) {
return acl.ErrPermissionDenied
}
} else {
if !rule.NodeWrite(a.config.NodeName) {
return acl.ErrPermissionDenied
}
}
// Vet any check that might be getting overwritten.
checks := a.state.Checks()
if existing, ok := checks[check.CheckID]; ok {
if len(existing.ServiceName) > 0 {
if !rule.ServiceWrite(existing.ServiceName) {
return acl.ErrPermissionDenied
}
} else {
if !rule.NodeWrite(a.config.NodeName) {
return acl.ErrPermissionDenied
}
}
}
return nil
}
// vetCheckUpdate makes sure that a check update is allowed by the given token.
func (a *Agent) vetCheckUpdate(token string, checkID types.CheckID) error {
// Resolve the token and bail if ACLs aren't enabled.
rule, err := a.resolveToken(token)
if err != nil {
return err
}
if rule == nil {
return nil
}
// Vet any changes based on the existing check's info.
checks := a.state.Checks()
if existing, ok := checks[checkID]; ok {
if len(existing.ServiceName) > 0 {
if !rule.ServiceWrite(existing.ServiceName) {
return acl.ErrPermissionDenied
}
} else {
if !rule.NodeWrite(a.config.NodeName) {
return acl.ErrPermissionDenied
}
}
} else {
return fmt.Errorf("Unknown check %q", checkID)
}
return nil
}
// filterMembers redacts members that the token doesn't have access to.
func (a *Agent) filterMembers(token string, members *[]serf.Member) error {
// Resolve the token and bail if ACLs aren't enabled.
rule, err := a.resolveToken(token)
if err != nil {
return err
}
if rule == nil {
return nil
}
// Filter out members based on the node policy.
m := *members
for i := 0; i < len(m); i++ {
node := m[i].Name
if rule.NodeRead(node) {
continue
}
a.logger.Printf("[DEBUG] agent: dropping node %q from result due to ACLs", node)
m = append(m[:i], m[i+1:]...)
i--
}
*members = m
return nil
}
// filterServices redacts services that the token doesn't have access to.
func (a *Agent) filterServices(token string, services *map[string]*structs.NodeService) error {
// Resolve the token and bail if ACLs aren't enabled.
rule, err := a.resolveToken(token)
if err != nil {
return err
}
if rule == nil {
return nil
}
// Filter out services based on the service policy.
for id, service := range *services {
if rule.ServiceRead(service.Service) {
continue
}
a.logger.Printf("[DEBUG] agent: dropping service %q from result due to ACLs", id)
delete(*services, id)
}
return nil
}
// filterChecks redacts checks that the token doesn't have access to.
func (a *Agent) filterChecks(token string, checks *map[types.CheckID]*structs.HealthCheck) error {
// Resolve the token and bail if ACLs aren't enabled.
rule, err := a.resolveToken(token)
if err != nil {
return err
}
if rule == nil {
return nil
}
// Filter out checks based on the node or service policy.
for id, check := range *checks {
if len(check.ServiceName) > 0 {
if rule.ServiceRead(check.ServiceName) {
continue
}
} else {
if rule.NodeRead(a.config.NodeName) {
continue
}
}
a.logger.Printf("[DEBUG] agent: dropping check %q from result due to ACLs", id)
delete(*checks, id)
}
return nil
}