users: create cache for user lookups (#16100)

* users: create cache for user lookups

This PR introduces a global cache for OS user lookups. This should
relieve pressure on the OS domain/directory lookups, which would be
queried more now that Task API exists.

Hits are cached for 1 hour, and misses are cached for 1 minute. These
values are fairly arbitrary - we can tweak them if there is any reason to.

Closes #16010

* users: delete expired negative entry from cache
This commit is contained in:
Seth Hoenig 2023-02-09 08:37:50 -06:00 committed by GitHub
parent 0d37892024
commit af28ac1610
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 253 additions and 3 deletions

3
.changelog/16100.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:improvement
users: Added a cache for OS user lookups
```

85
helper/users/cache.go Normal file
View File

@ -0,0 +1,85 @@
package users
import (
"os/user"
"sync"
"time"
"github.com/hashicorp/nomad/lib/lang"
"oss.indeed.com/go/libtime"
)
const (
cacheTTL = 1 * time.Hour
failureTTL = 1 * time.Minute
)
type entry[T any] lang.Pair[T, time.Time]
func (e *entry[T]) expired(now time.Time, ttl time.Duration) bool {
return now.After(e.Second.Add(ttl))
}
type (
userCache map[string]*entry[*user.User]
userFailureCache map[string]*entry[error]
)
type lookupUserFunc func(string) (*user.User, error)
type cache struct {
clock libtime.Clock
lookupUser lookupUserFunc
lock sync.Mutex
users userCache
userFailures userFailureCache
}
func newCache() *cache {
return &cache{
clock: libtime.SystemClock(),
lookupUser: internalLookupUser,
users: make(userCache),
userFailures: make(userFailureCache),
}
}
func (c *cache) GetUser(username string) (*user.User, error) {
c.lock.Lock()
defer c.lock.Unlock()
// record this moment as "now" for further cache operations
now := c.clock.Now()
// first check if the user is in the cache and the entry we have
// is not yet expired
usr, exists := c.users[username]
if exists && !usr.expired(now, cacheTTL) {
return usr.First, nil
}
// next check if there was a recent failure already, so we
// avoid spamming the OS with dead user lookups
failure, exists2 := c.userFailures[username]
if exists2 {
if !failure.expired(now, failureTTL) {
return nil, failure.First
}
// may as well cleanup expired case
delete(c.userFailures, username)
}
// need to perform an OS lookup
u, err := c.lookupUser(username)
// lookup was a failure, populate the failure cache
if err != nil {
c.userFailures[username] = &entry[error]{err, now}
return nil, err
}
// lookup was a success, populate the user cache
c.users[username] = &entry[*user.User]{u, now}
return u, nil
}

153
helper/users/cache_test.go Normal file
View File

@ -0,0 +1,153 @@
//go:build unix
package users
import (
"errors"
"os/user"
"testing"
"time"
"github.com/hashicorp/nomad/ci"
"github.com/shoenig/test/must"
"oss.indeed.com/go/libtime/libtimetest"
)
func TestCache_real_hit(t *testing.T) {
ci.Parallel(t)
c := newCache()
// fresh lookup
u, err := c.GetUser("nobody")
must.NoError(t, err)
must.NotNil(t, u)
// hit again, cached value
u2, err2 := c.GetUser("nobody")
must.NoError(t, err2)
must.NotNil(t, u2)
must.True(t, u == u2) // compare pointers
}
func TestCache_real_miss(t *testing.T) {
ci.Parallel(t)
c := newCache()
// fresh lookup
u, err := c.GetUser("doesnotexist")
must.Error(t, err)
must.Nil(t, u)
// hit again, cached value
u2, err2 := c.GetUser("doesnotexist")
must.Error(t, err2)
must.Nil(t, u2)
must.True(t, err == err2) // compare pointers
}
func TestCache_mock_hit(t *testing.T) {
ci.Parallel(t)
c := newCache()
lookupCount := 0
// hijack the underlying lookup function with our own mock
c.lookupUser = func(username string) (*user.User, error) {
lookupCount++
return &user.User{Name: username}, nil
}
// hijack the clock with our own mock
t0 := time.Now()
clockCount := 0
c.clock = libtimetest.NewClockMock(t).NowMock.Set(func() time.Time {
clockCount++
switch clockCount {
case 1:
return t0
case 2:
return t0.Add(59 * time.Minute)
default:
return t0.Add(61 * time.Minute)
}
})
const username = "armon"
// initial lookup
u, err := c.GetUser(username)
must.NoError(t, err)
must.Eq(t, "armon", u.Name)
must.Eq(t, 1, lookupCount)
must.Eq(t, 1, clockCount)
// second lookup, 59 minutes after initil lookup
u2, err2 := c.GetUser(username)
must.NoError(t, err2)
must.Eq(t, "armon", u2.Name)
must.Eq(t, 1, lookupCount) // was in cache
must.Eq(t, 2, clockCount)
// third lookup, 61 minutes after initial lookup (expired)
u3, err3 := c.GetUser(username)
must.NoError(t, err3)
must.Eq(t, "armon", u3.Name)
must.Eq(t, 2, lookupCount)
must.Eq(t, 3, clockCount)
}
func TestCache_mock_miss(t *testing.T) {
ci.Parallel(t)
c := newCache()
lookupCount := 0
lookupErr := errors.New("lookup error")
// hijack the underlying lookup function with our own mock
c.lookupUser = func(username string) (*user.User, error) {
lookupCount++
return nil, lookupErr
}
// hijack the clock with our own mock
t0 := time.Now()
clockCount := 0
c.clock = libtimetest.NewClockMock(t).NowMock.Set(func() time.Time {
clockCount++
switch clockCount {
case 1:
return t0
case 2:
return t0.Add(59 * time.Second)
default:
return t0.Add(61 * time.Second)
}
})
const username = "armon"
// initial lookup
u, err := c.GetUser(username)
must.ErrorIs(t, err, lookupErr)
must.Nil(t, u)
must.Eq(t, 1, lookupCount)
must.Eq(t, 1, clockCount)
// second lookup, 59 seconds after initial (still in cache)
u2, err2 := c.GetUser(username)
must.ErrorIs(t, err2, lookupErr)
must.Nil(t, u2)
must.Eq(t, 1, lookupCount) // in cache
must.Eq(t, 2, clockCount)
// third lookup, 61 seconds after initial (expired)
u3, err3 := c.GetUser(username)
must.ErrorIs(t, err3, lookupErr)
must.Nil(t, u3)
must.Eq(t, 2, lookupCount)
must.Eq(t, 3, clockCount)
}

View File

@ -12,12 +12,21 @@ import (
"github.com/hashicorp/go-multierror" "github.com/hashicorp/go-multierror"
) )
var globalCache = newCache()
// Lookup returns the user.User entry associated with the given username.
//
// Values are cached up to 1 hour, or 1 minute for failure cases.
func Lookup(username string) (*user.User, error) {
return globalCache.GetUser(username)
}
// lock is used to serialize all user lookup at the process level, because // lock is used to serialize all user lookup at the process level, because
// some NSS implementations are not concurrency safe // some NSS implementations are not concurrency safe
var lock sync.Mutex var lock sync.Mutex
// Lookup username while holding a global process lock. // internalLookupUser username while holding a global process lock.
func Lookup(username string) (*user.User, error) { func internalLookupUser(username string) (*user.User, error) {
lock.Lock() lock.Lock()
defer lock.Unlock() defer lock.Unlock()
return user.Lookup(username) return user.Lookup(username)

View File

@ -32,7 +32,7 @@ func NobodyIDs() (uint32, uint32) {
} }
func init() { func init() {
u, err := Lookup("nobody") u, err := internalLookupUser("nobody")
if err != nil { if err != nil {
panic(fmt.Sprintf("failed to lookup nobody user: %v", err)) panic(fmt.Sprintf("failed to lookup nobody user: %v", err))
} }