From af28ac1610d0a0d08328979291b259adeb280ce2 Mon Sep 17 00:00:00 2001 From: Seth Hoenig Date: Thu, 9 Feb 2023 08:37:50 -0600 Subject: [PATCH] users: create cache for user lookups (#16100) * users: create cache for user lookups This PR introduces a global cache for OS user lookups. This should relieve pressure on the OS domain/directory lookups, which would be queried more now that Task API exists. Hits are cached for 1 hour, and misses are cached for 1 minute. These values are fairly arbitrary - we can tweak them if there is any reason to. Closes #16010 * users: delete expired negative entry from cache --- .changelog/16100.txt | 3 + helper/users/cache.go | 85 ++++++++++++++++++++ helper/users/cache_test.go | 153 ++++++++++++++++++++++++++++++++++++ helper/users/lookup.go | 13 ++- helper/users/lookup_unix.go | 2 +- 5 files changed, 253 insertions(+), 3 deletions(-) create mode 100644 .changelog/16100.txt create mode 100644 helper/users/cache.go create mode 100644 helper/users/cache_test.go diff --git a/.changelog/16100.txt b/.changelog/16100.txt new file mode 100644 index 000000000..9a6081441 --- /dev/null +++ b/.changelog/16100.txt @@ -0,0 +1,3 @@ +```release-note:improvement +users: Added a cache for OS user lookups +``` diff --git a/helper/users/cache.go b/helper/users/cache.go new file mode 100644 index 000000000..0e45154a8 --- /dev/null +++ b/helper/users/cache.go @@ -0,0 +1,85 @@ +package users + +import ( + "os/user" + "sync" + "time" + + "github.com/hashicorp/nomad/lib/lang" + "oss.indeed.com/go/libtime" +) + +const ( + cacheTTL = 1 * time.Hour + failureTTL = 1 * time.Minute +) + +type entry[T any] lang.Pair[T, time.Time] + +func (e *entry[T]) expired(now time.Time, ttl time.Duration) bool { + return now.After(e.Second.Add(ttl)) +} + +type ( + userCache map[string]*entry[*user.User] + userFailureCache map[string]*entry[error] +) + +type lookupUserFunc func(string) (*user.User, error) + +type cache struct { + clock libtime.Clock + lookupUser lookupUserFunc + + lock sync.Mutex + users userCache + userFailures userFailureCache +} + +func newCache() *cache { + return &cache{ + clock: libtime.SystemClock(), + lookupUser: internalLookupUser, + users: make(userCache), + userFailures: make(userFailureCache), + } +} + +func (c *cache) GetUser(username string) (*user.User, error) { + c.lock.Lock() + defer c.lock.Unlock() + + // record this moment as "now" for further cache operations + now := c.clock.Now() + + // first check if the user is in the cache and the entry we have + // is not yet expired + usr, exists := c.users[username] + if exists && !usr.expired(now, cacheTTL) { + return usr.First, nil + } + + // next check if there was a recent failure already, so we + // avoid spamming the OS with dead user lookups + failure, exists2 := c.userFailures[username] + if exists2 { + if !failure.expired(now, failureTTL) { + return nil, failure.First + } + // may as well cleanup expired case + delete(c.userFailures, username) + } + + // need to perform an OS lookup + u, err := c.lookupUser(username) + + // lookup was a failure, populate the failure cache + if err != nil { + c.userFailures[username] = &entry[error]{err, now} + return nil, err + } + + // lookup was a success, populate the user cache + c.users[username] = &entry[*user.User]{u, now} + return u, nil +} diff --git a/helper/users/cache_test.go b/helper/users/cache_test.go new file mode 100644 index 000000000..4d651f68e --- /dev/null +++ b/helper/users/cache_test.go @@ -0,0 +1,153 @@ +//go:build unix + +package users + +import ( + "errors" + "os/user" + "testing" + "time" + + "github.com/hashicorp/nomad/ci" + "github.com/shoenig/test/must" + "oss.indeed.com/go/libtime/libtimetest" +) + +func TestCache_real_hit(t *testing.T) { + ci.Parallel(t) + + c := newCache() + + // fresh lookup + u, err := c.GetUser("nobody") + must.NoError(t, err) + must.NotNil(t, u) + + // hit again, cached value + u2, err2 := c.GetUser("nobody") + must.NoError(t, err2) + must.NotNil(t, u2) + must.True(t, u == u2) // compare pointers +} + +func TestCache_real_miss(t *testing.T) { + ci.Parallel(t) + + c := newCache() + + // fresh lookup + u, err := c.GetUser("doesnotexist") + must.Error(t, err) + must.Nil(t, u) + + // hit again, cached value + u2, err2 := c.GetUser("doesnotexist") + must.Error(t, err2) + must.Nil(t, u2) + must.True(t, err == err2) // compare pointers +} + +func TestCache_mock_hit(t *testing.T) { + ci.Parallel(t) + + c := newCache() + + lookupCount := 0 + + // hijack the underlying lookup function with our own mock + c.lookupUser = func(username string) (*user.User, error) { + lookupCount++ + return &user.User{Name: username}, nil + } + + // hijack the clock with our own mock + t0 := time.Now() + clockCount := 0 + c.clock = libtimetest.NewClockMock(t).NowMock.Set(func() time.Time { + clockCount++ + switch clockCount { + case 1: + return t0 + case 2: + return t0.Add(59 * time.Minute) + default: + return t0.Add(61 * time.Minute) + } + }) + + const username = "armon" + + // initial lookup + u, err := c.GetUser(username) + must.NoError(t, err) + must.Eq(t, "armon", u.Name) + must.Eq(t, 1, lookupCount) + must.Eq(t, 1, clockCount) + + // second lookup, 59 minutes after initil lookup + u2, err2 := c.GetUser(username) + must.NoError(t, err2) + must.Eq(t, "armon", u2.Name) + must.Eq(t, 1, lookupCount) // was in cache + must.Eq(t, 2, clockCount) + + // third lookup, 61 minutes after initial lookup (expired) + u3, err3 := c.GetUser(username) + must.NoError(t, err3) + must.Eq(t, "armon", u3.Name) + must.Eq(t, 2, lookupCount) + must.Eq(t, 3, clockCount) +} + +func TestCache_mock_miss(t *testing.T) { + ci.Parallel(t) + + c := newCache() + + lookupCount := 0 + lookupErr := errors.New("lookup error") + + // hijack the underlying lookup function with our own mock + c.lookupUser = func(username string) (*user.User, error) { + lookupCount++ + return nil, lookupErr + } + + // hijack the clock with our own mock + t0 := time.Now() + clockCount := 0 + c.clock = libtimetest.NewClockMock(t).NowMock.Set(func() time.Time { + clockCount++ + switch clockCount { + case 1: + return t0 + case 2: + return t0.Add(59 * time.Second) + default: + return t0.Add(61 * time.Second) + } + }) + + const username = "armon" + + // initial lookup + u, err := c.GetUser(username) + must.ErrorIs(t, err, lookupErr) + must.Nil(t, u) + must.Eq(t, 1, lookupCount) + must.Eq(t, 1, clockCount) + + // second lookup, 59 seconds after initial (still in cache) + u2, err2 := c.GetUser(username) + must.ErrorIs(t, err2, lookupErr) + must.Nil(t, u2) + must.Eq(t, 1, lookupCount) // in cache + must.Eq(t, 2, clockCount) + + // third lookup, 61 seconds after initial (expired) + u3, err3 := c.GetUser(username) + must.ErrorIs(t, err3, lookupErr) + must.Nil(t, u3) + must.Eq(t, 2, lookupCount) + must.Eq(t, 3, clockCount) +} diff --git a/helper/users/lookup.go b/helper/users/lookup.go index e5236993b..86b52f85a 100644 --- a/helper/users/lookup.go +++ b/helper/users/lookup.go @@ -12,12 +12,21 @@ import ( "github.com/hashicorp/go-multierror" ) +var globalCache = newCache() + +// Lookup returns the user.User entry associated with the given username. +// +// Values are cached up to 1 hour, or 1 minute for failure cases. +func Lookup(username string) (*user.User, error) { + return globalCache.GetUser(username) +} + // lock is used to serialize all user lookup at the process level, because // some NSS implementations are not concurrency safe var lock sync.Mutex -// Lookup username while holding a global process lock. -func Lookup(username string) (*user.User, error) { +// internalLookupUser username while holding a global process lock. +func internalLookupUser(username string) (*user.User, error) { lock.Lock() defer lock.Unlock() return user.Lookup(username) diff --git a/helper/users/lookup_unix.go b/helper/users/lookup_unix.go index 7dc9da81b..026d5a66b 100644 --- a/helper/users/lookup_unix.go +++ b/helper/users/lookup_unix.go @@ -32,7 +32,7 @@ func NobodyIDs() (uint32, uint32) { } func init() { - u, err := Lookup("nobody") + u, err := internalLookupUser("nobody") if err != nil { panic(fmt.Sprintf("failed to lookup nobody user: %v", err)) }