Merge pull request #4303 from pierresouchay/non_blocking_acl

Only send one single ACL cache refresh across network when TTL is over
This commit is contained in:
Matt Keeler 2018-07-10 08:57:33 -04:00 committed by GitHub
commit 22c5951ec4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 334 additions and 244 deletions

View File

@ -158,7 +158,7 @@ test: other-consul dev-build vet
@# hide it from travis as it exceeds their log limits and causes job to be
@# terminated (over 4MB and over 10k lines in the UI). We need to output
@# _something_ to stop them terminating us due to inactivity...
{ go test $(GOTEST_FLAGS) -tags '$(GOTAGS)' -timeout 5m $(GOTEST_PKGS) 2>&1 ; echo $$? > exit-code ; } | tee test.log | egrep '^(ok|FAIL)\s*github.com/hashicorp/consul'
{ go test $(GOTEST_FLAGS) -tags '$(GOTAGS)' -timeout 7m $(GOTEST_PKGS) 2>&1 ; echo $$? > exit-code ; } | tee test.log | egrep '^(ok|FAIL)\s*github.com/hashicorp/consul'
@echo "Exit code: $$(cat exit-code)" >> test.log
@# This prints all the race report between ====== lines
@awk '/^WARNING: DATA RACE/ {do_print=1; print "=================="} do_print==1 {print} /^={10,}/ {do_print=0}' test.log || true

View File

@ -103,7 +103,7 @@ func newACLManager(config *config.RuntimeConfig) (*aclManager, error) {
down = acl.AllowAll()
case "deny":
down = acl.DenyAll()
case "extend-cache":
case "async-cache", "extend-cache":
// Leave the down policy as nil to signal this.
default:
return nil, fmt.Errorf("invalid ACL down policy %q", config.ACLDownPolicy)

View File

@ -274,8 +274,10 @@ func TestACL_Down_Allow(t *testing.T) {
func TestACL_Down_Extend(t *testing.T) {
t.Parallel()
aclExtendPolicies := []string{"extend-cache", "async-cache"}
for _, aclDownPolicy := range aclExtendPolicies {
a := NewTestAgent(t.Name(), TestACLConfig()+`
acl_down_policy = "extend-cache"
acl_down_policy = "`+aclDownPolicy+`"
acl_enforce_version_8 = true
`)
defer a.Shutdown()
@ -349,6 +351,7 @@ func TestACL_Down_Extend(t *testing.T) {
t.Fatalf("should deny")
}
}
}
func TestACL_Cache(t *testing.T) {
t.Parallel()

View File

@ -94,8 +94,10 @@ type RuntimeConfig struct {
// ACL's to be used to service requests. This
// is the default. If the ACL is not in the cache,
// this acts like deny.
// * async-cache - Same behaviour as extend-cache, but perform ACL
// Lookups asynchronously when cache TTL is expired.
//
// hcl: acl_down_policy = ("allow"|"deny"|"extend-cache")
// hcl: acl_down_policy = ("allow"|"deny"|"extend-cache"|"async-cache")
ACLDownPolicy string
// ACLEnforceVersion8 is used to gate a set of ACL policy features that

View File

@ -4,6 +4,7 @@ import (
"fmt"
"log"
"os"
"sync"
"time"
"github.com/armon/go-metrics"
@ -116,6 +117,9 @@ type aclCache struct {
// local is a function used to look for an ACL locally if replication is
// enabled. This will be nil if replication isn't enabled.
local acl.FaultFunc
fetchMutex sync.RWMutex
fetchMap map[string][]chan (RemoteACLResult)
}
// newACLCache returns a new non-authoritative cache for ACLs. This is used for
@ -142,10 +146,17 @@ func newACLCache(conf *Config, logger *log.Logger, rpc rpcFn, local acl.FaultFun
if err != nil {
return nil, fmt.Errorf("Failed to create ACL policy cache: %v", err)
}
cache.fetchMap = make(map[string][]chan (RemoteACLResult))
return cache, nil
}
// Result Type returned when fetching Remote ACLs asynchronously
type RemoteACLResult struct {
result acl.ACL
err error
}
// lookupACL is used when we are non-authoritative, and need to resolve an ACL.
func (c *aclCache) lookupACL(id, authDC string) (acl.ACL, error) {
// Check the cache for the ACL.
@ -161,8 +172,23 @@ func (c *aclCache) lookupACL(id, authDC string) (acl.ACL, error) {
return cached.ACL, nil
}
metrics.IncrCounter([]string{"acl", "cache_miss"}, 1)
res := c.lookupACLRemote(id, authDC, cached)
return res.result, res.err
}
// Attempt to refresh the policy from the ACL datacenter via an RPC.
func (c *aclCache) fireResult(id string, theACL acl.ACL, err error) {
c.fetchMutex.Lock()
channels := c.fetchMap[id]
delete(c.fetchMap, id)
c.fetchMutex.Unlock()
aclResult := RemoteACLResult{theACL, err}
for _, cx := range channels {
cx <- aclResult
close(cx)
}
}
func (c *aclCache) loadACLInChan(id, authDC string, cached *aclCacheEntry) {
args := structs.ACLPolicyRequest{
Datacenter: authDC,
ACL: id,
@ -173,13 +199,21 @@ func (c *aclCache) lookupACL(id, authDC string) (acl.ACL, error) {
var reply structs.ACLPolicy
err := c.rpc("ACL.GetPolicy", &args, &reply)
if err == nil {
return c.useACLPolicy(id, authDC, cached, &reply)
theACL, theError := c.useACLPolicy(id, authDC, cached, &reply)
if cached != nil && theACL != nil {
cached.ACL = theACL
cached.ETag = reply.ETag
cached.Expires = time.Now().Add(c.config.ACLTTL)
}
c.fireResult(id, theACL, theError)
return
}
// Check for not-found, which will cause us to bail immediately. For any
// other error we report it in the logs but can continue.
if acl.IsErrNotFound(err) {
return nil, acl.ErrNotFound
c.fireResult(id, nil, acl.ErrNotFound)
return
}
c.logger.Printf("[ERR] consul.acl: Failed to get policy from ACL datacenter: %v", err)
@ -200,7 +234,7 @@ func (c *aclCache) lookupACL(id, authDC string) (acl.ACL, error) {
// local ACL fault function is registered to query replicated ACL data,
// and the user's policy allows it, we will try locally before we give
// up.
if c.local != nil && c.config.ACLDownPolicy == "extend-cache" {
if c.local != nil && (c.config.ACLDownPolicy == "extend-cache" || c.config.ACLDownPolicy == "async-cache") {
parent, rules, err := c.local(id)
if err != nil {
// We don't make an exception here for ACLs that aren't
@ -227,24 +261,58 @@ func (c *aclCache) lookupACL(id, authDC string) (acl.ACL, error) {
reply.TTL = c.config.ACLTTL
reply.Parent = parent
reply.Policy = policy
return c.useACLPolicy(id, authDC, cached, &reply)
theACL, theError := c.useACLPolicy(id, authDC, cached, &reply)
if cached != nil && theACL != nil {
cached.ACL = theACL
cached.ETag = reply.ETag
cached.Expires = time.Now().Add(c.config.ACLTTL)
}
c.fireResult(id, theACL, theError)
return
}
ACL_DOWN:
// Unable to refresh, apply the down policy.
switch c.config.ACLDownPolicy {
case "allow":
return acl.AllowAll(), nil
case "extend-cache":
c.fireResult(id, acl.AllowAll(), nil)
return
case "async-cache", "extend-cache":
if cached != nil {
return cached.ACL, nil
c.fireResult(id, cached.ACL, nil)
return
}
fallthrough
default:
return acl.DenyAll(), nil
c.fireResult(id, acl.DenyAll(), nil)
return
}
}
func (c *aclCache) lookupACLRemote(id, authDC string, cached *aclCacheEntry) RemoteACLResult {
// Attempt to refresh the policy from the ACL datacenter via an RPC.
myChan := make(chan RemoteACLResult)
mustWaitForResult := cached == nil || c.config.ACLDownPolicy != "async-cache"
c.fetchMutex.Lock()
clients, ok := c.fetchMap[id]
if !ok || clients == nil {
clients = make([]chan RemoteACLResult, 0)
}
if mustWaitForResult {
c.fetchMap[id] = append(clients, myChan)
}
c.fetchMutex.Unlock()
if !ok {
go c.loadACLInChan(id, authDC, cached)
}
if !mustWaitForResult {
return RemoteACLResult{cached.ACL, nil}
}
res := <-myChan
return res
}
// useACLPolicy handles an ACLPolicy response
func (c *aclCache) useACLPolicy(id, authDC string, cached *aclCacheEntry, p *structs.ACLPolicy) (acl.ACL, error) {
// Check if we can used the cached policy

View File

@ -508,10 +508,13 @@ func TestACL_DownPolicy_Allow(t *testing.T) {
func TestACL_DownPolicy_ExtendCache(t *testing.T) {
t.Parallel()
aclExtendPolicies := []string{"extend-cache", "async-cache"} //"async-cache"
for _, aclDownPolicy := range aclExtendPolicies {
dir1, s1 := testServerWithConfig(t, func(c *Config) {
c.ACLDatacenter = "dc1"
c.ACLTTL = 0
c.ACLDownPolicy = "extend-cache"
c.ACLDownPolicy = aclDownPolicy
c.ACLMasterToken = "root"
})
defer os.RemoveAll(dir1)
@ -522,7 +525,7 @@ func TestACL_DownPolicy_ExtendCache(t *testing.T) {
dir2, s2 := testServerWithConfig(t, func(c *Config) {
c.ACLDatacenter = "dc1" // Enable ACLs!
c.ACLTTL = 0
c.ACLDownPolicy = "extend-cache"
c.ACLDownPolicy = aclDownPolicy
c.Bootstrap = false // Disable bootstrap
})
defer os.RemoveAll(dir2)
@ -582,9 +585,13 @@ func TestACL_DownPolicy_ExtendCache(t *testing.T) {
t.Fatalf("bad acl: %#v", aclR)
}
}
}
func TestACL_Replication(t *testing.T) {
t.Parallel()
aclExtendPolicies := []string{"extend-cache", "async-cache"} //"async-cache"
for _, aclDownPolicy := range aclExtendPolicies {
dir1, s1 := testServerWithConfig(t, func(c *Config) {
c.ACLDatacenter = "dc1"
c.ACLMasterToken = "root"
@ -598,7 +605,7 @@ func TestACL_Replication(t *testing.T) {
c.Datacenter = "dc2"
c.ACLDatacenter = "dc1"
c.ACLDefaultPolicy = "deny"
c.ACLDownPolicy = "extend-cache"
c.ACLDownPolicy = aclDownPolicy
c.EnableACLReplication = true
c.ACLReplicationInterval = 10 * time.Millisecond
c.ACLReplicationApplyLimit = 1000000
@ -697,6 +704,7 @@ func TestACL_Replication(t *testing.T) {
t.Fatalf("unexpected read")
}
}
}
func TestACL_MultiDC_Found(t *testing.T) {
t.Parallel()

View File

@ -235,8 +235,9 @@ type Config struct {
// ACLDownPolicy controls the behavior of ACLs if the ACLDatacenter
// cannot be contacted. It can be either "deny" to deny all requests,
// or "extend-cache" which ignores the ACLCacheInterval and uses
// cached policies. If a policy is not in the cache, it acts like deny.
// "extend-cache" or "async-cache" which ignores the ACLCacheInterval and
// uses cached policies.
// If a policy is not in the cache, it acts like deny.
// "allow" can be used to allow all requests. This is not recommended.
ACLDownPolicy string
@ -378,7 +379,7 @@ func (c *Config) CheckACL() error {
switch c.ACLDownPolicy {
case "allow":
case "deny":
case "extend-cache":
case "async-cache", "extend-cache":
default:
return fmt.Errorf("Unsupported down ACL policy: %s", c.ACLDownPolicy)
}

View File

@ -496,11 +496,14 @@ Consul will not enable TLS for the HTTP API unless the `https` port has been ass
to enable ACL support.
* <a name="acl_down_policy"></a><a href="#acl_down_policy">`acl_down_policy`</a> - Either
"allow", "deny" or "extend-cache"; "extend-cache" is the default. In the case that the
"allow", "deny", "extend-cache" or "async-cache"; "extend-cache" is the default. In the case that the
policy for a token cannot be read from the [`acl_datacenter`](#acl_datacenter) or leader
node, the down policy is applied. In "allow" mode, all actions are permitted, "deny" restricts
all operations, and "extend-cache" allows any cached ACLs to be used, ignoring their TTL
values. If a non-cached ACL is used, "extend-cache" acts like "deny".
The value "async-cache" acts the same way as "extend-cache" but performs updates
asynchronously when ACL is present but its TTL is expired, thus, if latency is bad between
ACL authoritative and other datacenters, latency of operations is not impacted.
* <a name="acl_agent_master_token"></a><a href="#acl_agent_master_token">`acl_agent_master_token`</a> -
Used to access <a href="/api/agent.html">agent endpoints</a> that require agent read

View File

@ -1061,6 +1061,11 @@ and the [`acl_down_policy`](/docs/agent/options.html#acl_down_policy)
is set to "extend-cache", tokens will be resolved during the outage using the
replicated set of ACLs. An [ACL replication status](/api/acl.html#acl_replication_status)
endpoint is available to monitor the health of the replication process.
Also note that in recent versions of Consul (greater than 1.2.0), using
`acl_down_policy = "async-cache"` refreshes token asynchronously when an ACL is
already cached and is expired while similar semantics than "extend-cache".
It allows to avoid having issues when connectivity with the authoritative is not completely
broken, but very slow.
Locally-resolved ACLs will be cached using the [`acl_ttl`](/docs/agent/options.html#acl_ttl)
setting of the non-authoritative datacenter, so these entries may persist in the