open-nomad/client/acl.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package client

import (
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/go-set"
	"github.com/hashicorp/nomad/acl"
	"github.com/hashicorp/nomad/nomad/structs"
)

// Capacity limits, in entries, for the client-side ACL caches.
const (
	// tokenCacheSize caps the cache of resolved bearer tokens, covering both
	// ACL tokens and workload identities. Fetching a token has a cost, so the
	// hot tokens are kept locally to reduce the number of lookups.
	tokenCacheSize = 128

	// roleCacheSize caps the cache of ACL roles. Looking up a role requires
	// an RPC call, so the hot roles are kept locally to reduce the number of
	// lookups.
	roleCacheSize = 64

	// policyCacheSize caps the cache of ACL policies. Fetching a policy has a
	// cost, so the hot policies are kept locally to reduce ACL token
	// resolution time.
	policyCacheSize = 64

	// aclCacheSize caps the cache of ACL objects. Parsing and constructing an
	// ACL object has a cost, so the hot objects are kept locally to reduce
	// ACL token resolution time.
	aclCacheSize = 64
)

// clientACLResolver holds the state required for client resolution
// of ACLs. It consists of four caches, all of which are created by init.
type clientACLResolver struct {
	// aclCache is used to maintain the parsed ACL objects. It is handed to
	// structs.CompileACLObject when an ACL is built from a set of policies.
	aclCache *structs.ACLCache[*acl.ACL]

	// policyCache is used to maintain the fetched policy objects. Each entry
	// is keyed by the policy name.
	policyCache *structs.ACLCache[*structs.ACLPolicy]

	// tokenCache is used to maintain the fetched token objects. Each entry is
	// keyed by the bearer token that was resolved.
	tokenCache *structs.ACLCache[*structs.AuthenticatedIdentity]

	// roleCache is used to maintain a cache of the fetched ACL roles. Each
	// entry is keyed by the role ID.
	roleCache *structs.ACLCache[*structs.ACLRole]
}

// init creates the ACL, policy, token and role caches used by the resolver,
// each bounded by its corresponding *CacheSize constant.
func (c *clientACLResolver) init() {
	var (
		acls     = structs.NewACLCache[*acl.ACL](aclCacheSize)
		policies = structs.NewACLCache[*structs.ACLPolicy](policyCacheSize)
		tokens   = structs.NewACLCache[*structs.AuthenticatedIdentity](tokenCacheSize)
		roles    = structs.NewACLCache[*structs.ACLRole](roleCacheSize)
	)

	c.aclCache, c.policyCache = acls, policies
	c.tokenCache, c.roleCache = tokens, roles
}

// ResolveToken translates a bearer token, either an ACL token secret ID or a
// workload identity, into an ACL object. The returned ACL is nil when ACLs
// are disabled. It delegates to resolveTokenAndACL and discards the resolved
// identity.
func (c *Client) ResolveToken(bearerToken string) (*acl.ACL, error) {
	aclObj, _, err := c.resolveTokenAndACL(bearerToken)
	return aclObj, err
}

// resolveTokenAndACL translates a bearer token, either an ACL token secret ID
// or a workload identity, into a compiled ACL object together with the
// identity the token resolved to.
//
// It returns (nil, nil, nil) when ACLs are disabled. structs.ErrTokenNotFound
// is returned when the bearer token resolves to no identity, or to an identity
// carrying neither an ACL token nor workload identity claims.
// structs.ErrTokenExpired is returned when the identity has expired.
// Management tokens short-circuit to acl.ManagementACL without any policy
// lookups. Errors from resolving the token, its roles or its policies are
// returned unchanged.
func (c *Client) resolveTokenAndACL(bearerToken string) (*acl.ACL, *structs.AuthenticatedIdentity, error) {
	// Fast-path if ACLs are disabled
	if !c.GetConfig().ACLEnabled {
		return nil, nil, nil
	}
	defer metrics.MeasureSince([]string{"client", "acl", "resolve_token"}, time.Now())

	// Resolve the token value
	ident, err := c.resolveTokenValue(bearerToken)
	if err != nil {
		return nil, nil, err
	}

	// Only allow ACLs and workload identities to call client RPCs. The
	// identity is returned (and cached) exactly as the server sent it, so
	// guard against a nil identity rather than dereferencing it and panicking.
	if ident == nil || (ident.ACLToken == nil && ident.Claims == nil) {
		return nil, nil, structs.ErrTokenNotFound
	}

	// Give the token expiry some slight leeway in case the client and server
	// clocks are skewed.
	if ident.IsExpired(time.Now().Add(2 * time.Second)) {
		return nil, nil, structs.ErrTokenExpired
	}

	var policies []*structs.ACLPolicy

	// Resolve token policies
	if token := ident.ACLToken; token != nil {
		// Management tokens are granted the management ACL directly; no
		// role or policy resolution is required.
		if token.Type == structs.ACLManagementToken {
			return acl.ManagementACL, ident, nil
		}

		// Resolve the policy links within the token ACL roles.
		policyNames, err := c.resolveTokenACLRoles(bearerToken, token.Roles)
		if err != nil {
			return nil, nil, err
		}

		// Generate a slice of all policy names included within the token, taken
		// from both the ACL roles and the direct assignments.
		policyNames = append(policyNames, token.Policies...)

		// Resolve ACL token policies
		if policies, err = c.resolvePolicies(token.SecretID, policyNames); err != nil {
			return nil, nil, err
		}
	} else {
		// Resolve policies for workload identities. The server determines
		// the applicable policies from the bearer token sent as the request
		// auth token.
		policyArgs := structs.GenericRequest{
			QueryOptions: structs.QueryOptions{
				AuthToken: bearerToken,
				Region:    c.Region(),
			},
		}
		policyReply := structs.ACLPolicySetResponse{}
		if err := c.RPC("ACL.GetClaimPolicies", &policyArgs, &policyReply); err != nil {
			return nil, nil, err
		}
		policies = make([]*structs.ACLPolicy, 0, len(policyReply.Policies))
		for _, p := range policyReply.Policies {
			policies = append(policies, p)
		}
	}

	// Resolve the ACL object
	aclObj, err := structs.CompileACLObject(c.aclCache, policies)
	if err != nil {
		return nil, nil, err
	}
	return aclObj, ident, nil
}

// resolveTokenValue translates a bearer token, either an ACL token's secret
// or a workload identity, into an authenticated identity, with caching.
//
// The empty bearer token resolves to the anonymous ACL token without any
// lookup. Otherwise a cached identity is returned while its age is within the
// configured ACLTokenTTL; past that, the identity is resolved via a server
// and the response is cached. If the server cannot be reached but a cached
// value exists, the cached value is returned despite its age so that outages
// are handled gracefully.
func (c *Client) resolveTokenValue(bearerToken string) (*structs.AuthenticatedIdentity, error) {
	// Hot-path the anonymous token
	if bearerToken == "" {
		return &structs.AuthenticatedIdentity{ACLToken: structs.AnonymousACLToken}, nil
	}

	// Serve from the cache while the entry is still within its TTL.
	cached, found := c.tokenCache.Get(bearerToken)
	if found && cached.Age() <= c.GetConfig().ACLTokenTTL {
		return cached.Get(), nil
	}

	// The entry is absent or too old: ask a server who this token belongs to.
	var (
		args = structs.GenericRequest{
			QueryOptions: structs.QueryOptions{
				AuthToken:  bearerToken,
				Region:     c.Region(),
				AllowStale: true,
			},
		}
		reply structs.ACLWhoAmIResponse
	)

	err := c.RPC("ACL.WhoAmI", &args, &reply)
	switch {
	case err == nil:
		// Cache the response (positive or negative)
		c.tokenCache.Add(bearerToken, reply.Identity)
		return reply.Identity, nil

	case found:
		// The lookup failed, but an old cached value exists: mask the error
		// and keep using the cached value.
		c.logger.Warn("failed to resolve token, using expired cached value", "error", err)
		return cached.Get(), nil

	default:
		return nil, err
	}
}

// resolvePolicies translates a set of ACL policy names into policy objects.
//
// Policies are served from the local cache while their age is within the
// configured ACLPolicyTTL. Policies that are missing from the cache or have
// expired are fetched from a server with a single RPC authenticated by
// secretID, and the results are cached. If that RPC fails and nothing was
// missing from the cache, the expired cached policies are returned instead so
// that outages are handled gracefully; otherwise the RPC error is returned.
func (c *Client) resolvePolicies(secretID string, policies []string) ([]*structs.ACLPolicy, error) {
	var (
		resolved []*structs.ACLPolicy // usable results, returned to the caller
		stale    []*structs.ACLPolicy // cached, but older than the TTL
		uncached []string             // names with no cache entry at all
	)

	// Classify each requested policy by the state of its cache entry.
	for _, name := range policies {
		cached, found := c.policyCache.Get(name)
		switch {
		case !found:
			uncached = append(uncached, name)
		case cached.Age() <= c.GetConfig().ACLPolicyTTL:
			resolved = append(resolved, cached.Get())
		default:
			stale = append(stale, cached.Get())
		}
	}

	// Hot-path: everything was served from a fresh cache entry.
	if len(uncached) == 0 && len(stale) == 0 {
		return resolved, nil
	}

	// Build the list of names to fetch: the uncached ones first, followed by
	// the stale ones.
	names := make([]string, 0, len(uncached)+len(stale))
	names = append(names, uncached...)
	for _, policy := range stale {
		names = append(names, policy.Name)
	}

	args := structs.ACLPolicySetRequest{
		Names: names,
		QueryOptions: structs.QueryOptions{
			Region:     c.Region(),
			AuthToken:  secretID,
			AllowStale: true,
		},
	}
	var reply structs.ACLPolicySetResponse
	if err := c.RPC("ACL.GetPolicies", &args, &reply); err != nil {
		// Without a cached copy of every requested policy there is nothing to
		// fall back on.
		if len(uncached) > 0 {
			return nil, err
		}

		// Only stale entries needed refreshing: mask the error and keep using
		// the cached copies.
		c.logger.Warn("failed to resolve policies, using expired cached value", "error", err)
		return append(resolved, stale...), nil
	}

	// Cache every policy the server returned and add it to the result.
	for _, policy := range reply.Policies {
		c.policyCache.Add(policy.Name, policy)
		resolved = append(resolved, policy)
	}

	return resolved, nil
}

// resolveTokenACLRoles unpacks the ACL roles linked to a token, and their
// policy assignments, into a deduplicated list of ACL policy names. This can
// then be used to compile an ACL object.
//
// Roles are served from the local role cache while their age is within the
// configured ACLRoleTTL. Roles that are missing from the cache or have expired
// are looked up from the servers with a single RPC authenticated by secretID,
// and the results are cached.
//
// If that RPC fails and every role being refreshed is merely expired (none
// are missing from the cache), the policy links of the expired cached roles
// are used so that intermittent failures are handled gracefully. If any role
// is missing from the cache, the RPC error is returned.
func (c *Client) resolveTokenACLRoles(secretID string, roleLinks []*structs.ACLTokenRoleLink) ([]string, error) {

	var (
		// missingRoleIDs are the roles linked which are not found within our
		// cache. These must be looked up from the server via an RPC, so we
		// can correctly identify the policy links.
		missingRoleIDs []string

		// expiredRoles are the roles linked which have been found within our
		// cache, but are expired. These should be refreshed from the server
		// via an RPC. The cached objects are kept, rather than just their
		// IDs, so their policy links can be used if the RPC fails.
		expiredRoles []*structs.ACLRole
	)

	// policyNames tracks the resolved ACL policies which are linked to the
	// roles as a deduplicated list. This is the output object and represents
	// the authorisation the roles provide token bearers.
	policyNames := set.New[string](0)

	// addRolePolicies records the name of every policy linked to the role.
	addRolePolicies := func(role *structs.ACLRole) {
		for _, policyLink := range role.Policies {
			policyNames.Insert(policyLink.Name)
		}
	}

	for _, roleLink := range roleLinks {

		// Look within the cache to see if the role is already present. If we
		// do not find it, add the ID to our tracking, so we look this up via
		// RPC.
		entry, ok := c.roleCache.Get(roleLink.ID)
		if !ok {
			missingRoleIDs = append(missingRoleIDs, roleLink.ID)
			continue
		}

		// If the cached value is expired, add the role to our tracking, so we
		// look this up via RPC. Otherwise, add each linked policy name to our
		// return object tracking.
		if entry.Age() <= c.GetConfig().ACLRoleTTL {
			addRolePolicies(entry.Get())
		} else {
			expiredRoles = append(expiredRoles, entry.Get())
		}
	}

	// Hot-path: we were able to resolve all ACL roles via the cache and
	// generate a list of linked policy names. Therefore, we can avoid making
	// any RPC calls.
	if len(missingRoleIDs)+len(expiredRoles) == 0 {
		return policyNames.Slice(), nil
	}

	// Create a combined list of role IDs that we need to lookup from server
	// state: the missing roles first, followed by the expired ones.
	roleIDsToFetch := make([]string, 0, len(missingRoleIDs)+len(expiredRoles))
	roleIDsToFetch = append(roleIDsToFetch, missingRoleIDs...)
	for _, expiredRole := range expiredRoles {
		roleIDsToFetch = append(roleIDsToFetch, expiredRole.ID)
	}

	// Generate an RPC request to detail all the ACL roles that we did not find
	// or were expired within the cache.
	roleByIDReq := structs.ACLRolesByIDRequest{
		ACLRoleIDs: roleIDsToFetch,
		QueryOptions: structs.QueryOptions{
			Region:     c.Region(),
			AuthToken:  secretID,
			AllowStale: true,
		},
	}

	var roleByIDResp structs.ACLRolesByIDResponse

	// Perform the RPC call to detail the required ACL roles. If the RPC call
	// fails, and we are only updating expired cache entries, use the expired
	// entries. This allows us to handle intermittent failures.
	err := c.RPC(structs.ACLGetRolesByIDRPCMethod, &roleByIDReq, &roleByIDResp)
	if err != nil {
		if len(missingRoleIDs) == 0 {
			c.logger.Warn("failed to resolve ACL roles, using expired cached value", "error", err)

			// The reply of a failed RPC cannot be relied upon, so the policy
			// links must come from the expired cache entries themselves.
			for _, expiredRole := range expiredRoles {
				addRolePolicies(expiredRole)
			}
			return policyNames.Slice(), nil
		}
		return nil, err
	}

	// Generate a timestamp for the cache entries. A single timestamp is
	// shared by every role in the response; we do not need one per role.
	now := time.Now()
	for _, aclRole := range roleByIDResp.ACLRoles {
		// Add an entry to the cache using the generated timestamp for future
		// expiry calculations. Any existing, expired entry will be
		// overwritten.
		c.roleCache.AddAtTime(aclRole.ID, aclRole, now)

		// Add the name of each policy linked to the role to our return
		// response tracking.
		addRolePolicies(aclRole)
	}

	return policyNames.Slice(), nil
}