open-vault/helper/fairshare/jobmanager.go

242 lines
5.8 KiB
Go
Raw Normal View History

VAULT-1401 and 1402 - preliminary fair sharing (#1701) (#10917) * basic pool and start testing * refactor a bit for testing * workFunc, start/stop safety, testing * cleanup function for worker quit, more tests * redo public/private members * improve tests, export types, switch uuid package * fix loop capture bug, cleanup * cleanup tests * update worker pool file name, other improvements * add job manager prototype * remove remnants * add functions to wait for job manager and worker pool to stop, other fixes * test job manager functionality, fix bugs * encapsulate how jobs are distributed to workers * make worker job channel read only * add job interface, more testing, fixes * set name for dispatcher * fix test races * dispatcher and job manager constructors don't return errors * logger now dependency injected * make some members private, test fcn to get worker pool size * make GetNumWorkers public * Update helper/fairshare/jobmanager_test.go Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com> * make workerpool private * remove custom worker names * concurrency improvements * remove worker pool cleanup function * remove cleanup func from job manager, remove non blocking stop from fairshare * stop fairshare when started in tests * stop leaking job manager goroutine * prototype channel for waking up to assign work * fix typo/bug and add tests * improve job manager wake up, fix test typo * put channel drain back * better start/pause test for job manager * go mod vendor Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com> Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
2021-02-12 21:51:52 +00:00
package fairshare
import (
"container/list"
"fmt"
"io/ioutil"
"sync"
log "github.com/hashicorp/go-hclog"
uuid "github.com/hashicorp/go-uuid"
"github.com/hashicorp/vault/sdk/helper/logging"
)
/*
Future Work:
- track workers per queue. this will involve things like:
- somehow wrap the Execute/OnFailure functions to increment counter when
they start running, and decrement when they stop running
-- put a queue.IncrementCounter() call at the beginning
-- call the provided work function in the middle
-- put a queue.DecrementCounter() call at the end
- job has a queueID or reference to the queue
- queue only removed when empty AND no workers
*/
type JobManager struct {
name string
queues map[string]*list.List
queuesIndex []string
lastQueueAccessed int
quit chan struct{}
newWork chan struct{} // must be buffered
workerPool *dispatcher
onceStart sync.Once
onceStop sync.Once
logger log.Logger
// waitgroup for testing stop functionality
wg sync.WaitGroup
VAULT-1401 and 1402 - preliminary fair sharing (#1701) (#10917) * basic pool and start testing * refactor a bit for testing * workFunc, start/stop safety, testing * cleanup function for worker quit, more tests * redo public/private members * improve tests, export types, switch uuid package * fix loop capture bug, cleanup * cleanup tests * update worker pool file name, other improvements * add job manager prototype * remove remnants * add functions to wait for job manager and worker pool to stop, other fixes * test job manager functionality, fix bugs * encapsulate how jobs are distributed to workers * make worker job channel read only * add job interface, more testing, fixes * set name for dispatcher * fix test races * dispatcher and job manager constructors don't return errors * logger now dependency injected * make some members private, test fcn to get worker pool size * make GetNumWorkers public * Update helper/fairshare/jobmanager_test.go Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com> * make workerpool private * remove custom worker names * concurrency improvements * remove worker pool cleanup function * remove cleanup func from job manager, remove non blocking stop from fairshare * stop fairshare when started in tests * stop leaking job manager goroutine * prototype channel for waking up to assign work * fix typo/bug and add tests * improve job manager wake up, fix test typo * put channel drain back * better start/pause test for job manager * go mod vendor Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com> Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
2021-02-12 21:51:52 +00:00
// protects `queues`, `queuesIndex`, `lastQueueAccessed`
l sync.RWMutex
}
// NewJobManager creates a job manager, with an optional name
func NewJobManager(name string, numWorkers int, l log.Logger) *JobManager {
if l == nil {
l = logging.NewVaultLoggerWithWriter(ioutil.Discard, log.NoLevel)
}
if name == "" {
guid, err := uuid.GenerateUUID()
if err != nil {
l.Warn("uuid generator failed, using 'no-uuid'", "err", err)
guid = "no-uuid"
}
name = fmt.Sprintf("jobmanager-%s", guid)
}
wp := newDispatcher(fmt.Sprintf("%s-dispatcher", name), numWorkers, l)
j := JobManager{
name: name,
queues: make(map[string]*list.List),
queuesIndex: make([]string, 0),
lastQueueAccessed: -1,
quit: make(chan struct{}),
newWork: make(chan struct{}, 1),
workerPool: wp,
logger: l,
}
j.logger.Trace("created job manager", "name", name, "pool_size", numWorkers)
return &j
}
// Start starts the job manager
// note: a given job manager cannot be restarted after it has been stopped
func (j *JobManager) Start() {
j.onceStart.Do(func() {
j.logger.Trace("starting job manager", "name", j.name)
j.workerPool.start()
j.assignWork()
})
}
// Stop stops the job manager asynchronously
VAULT-1401 and 1402 - preliminary fair sharing (#1701) (#10917) * basic pool and start testing * refactor a bit for testing * workFunc, start/stop safety, testing * cleanup function for worker quit, more tests * redo public/private members * improve tests, export types, switch uuid package * fix loop capture bug, cleanup * cleanup tests * update worker pool file name, other improvements * add job manager prototype * remove remnants * add functions to wait for job manager and worker pool to stop, other fixes * test job manager functionality, fix bugs * encapsulate how jobs are distributed to workers * make worker job channel read only * add job interface, more testing, fixes * set name for dispatcher * fix test races * dispatcher and job manager constructors don't return errors * logger now dependency injected * make some members private, test fcn to get worker pool size * make GetNumWorkers public * Update helper/fairshare/jobmanager_test.go Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com> * make workerpool private * remove custom worker names * concurrency improvements * remove worker pool cleanup function * remove cleanup func from job manager, remove non blocking stop from fairshare * stop fairshare when started in tests * stop leaking job manager goroutine * prototype channel for waking up to assign work * fix typo/bug and add tests * improve job manager wake up, fix test typo * put channel drain back * better start/pause test for job manager * go mod vendor Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com> Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
2021-02-12 21:51:52 +00:00
func (j *JobManager) Stop() {
j.onceStop.Do(func() {
j.logger.Trace("terminating job manager...")
VAULT-1401 and 1402 - preliminary fair sharing (#1701) (#10917) * basic pool and start testing * refactor a bit for testing * workFunc, start/stop safety, testing * cleanup function for worker quit, more tests * redo public/private members * improve tests, export types, switch uuid package * fix loop capture bug, cleanup * cleanup tests * update worker pool file name, other improvements * add job manager prototype * remove remnants * add functions to wait for job manager and worker pool to stop, other fixes * test job manager functionality, fix bugs * encapsulate how jobs are distributed to workers * make worker job channel read only * add job interface, more testing, fixes * set name for dispatcher * fix test races * dispatcher and job manager constructors don't return errors * logger now dependency injected * make some members private, test fcn to get worker pool size * make GetNumWorkers public * Update helper/fairshare/jobmanager_test.go Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com> * make workerpool private * remove custom worker names * concurrency improvements * remove worker pool cleanup function * remove cleanup func from job manager, remove non blocking stop from fairshare * stop fairshare when started in tests * stop leaking job manager goroutine * prototype channel for waking up to assign work * fix typo/bug and add tests * improve job manager wake up, fix test typo * put channel drain back * better start/pause test for job manager * go mod vendor Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com> Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
2021-02-12 21:51:52 +00:00
close(j.quit)
j.workerPool.stop()
VAULT-1401 and 1402 - preliminary fair sharing (#1701) (#10917) * basic pool and start testing * refactor a bit for testing * workFunc, start/stop safety, testing * cleanup function for worker quit, more tests * redo public/private members * improve tests, export types, switch uuid package * fix loop capture bug, cleanup * cleanup tests * update worker pool file name, other improvements * add job manager prototype * remove remnants * add functions to wait for job manager and worker pool to stop, other fixes * test job manager functionality, fix bugs * encapsulate how jobs are distributed to workers * make worker job channel read only * add job interface, more testing, fixes * set name for dispatcher * fix test races * dispatcher and job manager constructors don't return errors * logger now dependency injected * make some members private, test fcn to get worker pool size * make GetNumWorkers public * Update helper/fairshare/jobmanager_test.go Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com> * make workerpool private * remove custom worker names * concurrency improvements * remove worker pool cleanup function * remove cleanup func from job manager, remove non blocking stop from fairshare * stop fairshare when started in tests * stop leaking job manager goroutine * prototype channel for waking up to assign work * fix typo/bug and add tests * improve job manager wake up, fix test typo * put channel drain back * better start/pause test for job manager * go mod vendor Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com> Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
2021-02-12 21:51:52 +00:00
})
}
// AddJob adds a job to the given queue, creating the queue if it doesn't exist
func (j *JobManager) AddJob(job Job, queueID string) {
j.l.Lock()
if len(j.queues) == 0 {
defer func() {
// newWork must be buffered to avoid deadlocks if work is added
// before the job manager is started
j.newWork <- struct{}{}
}()
}
defer j.l.Unlock()
if _, ok := j.queues[queueID]; !ok {
j.addQueue(queueID)
}
j.queues[queueID].PushBack(job)
}
// GetCurrentJobCount returns the total number of pending jobs in the job manager
func (j *JobManager) GetPendingJobCount() int {
j.l.RLock()
defer j.l.RUnlock()
cnt := 0
for _, q := range j.queues {
cnt += q.Len()
}
return cnt
}
// GetWorkerCounts() returns a map of queue ID to number of active workers
func (j *JobManager) GetWorkerCounts() map[string]int {
// TODO implement with VLT-145
return nil
}
// GetWorkQueueLengths() returns a map of queue ID to number of active workers
func (j *JobManager) GetWorkQueueLengths() map[string]int {
out := make(map[string]int)
j.l.RLock()
defer j.l.RUnlock()
for k, v := range j.queues {
out[k] = v.Len()
}
return out
}
// getNextJob grabs the next job to be processed and prunes empty queues
func (j *JobManager) getNextJob() Job {
j.l.Lock()
defer j.l.Unlock()
if len(j.queues) == 0 {
return nil
}
j.lastQueueAccessed = (j.lastQueueAccessed + 1) % len(j.queuesIndex)
queueID := j.queuesIndex[j.lastQueueAccessed]
jobElement := j.queues[queueID].Front()
out := j.queues[queueID].Remove(jobElement)
if j.queues[queueID].Len() == 0 {
j.removeLastQueueAccessed()
}
return out.(Job)
}
// assignWork continually loops checks for new jobs and dispatches them to the
// worker pool
func (j *JobManager) assignWork() {
j.wg.Add(1)
go func() {
for {
for {
// assign work while there are jobs to distribute
select {
case <-j.quit:
j.wg.Done()
return
case <-j.newWork:
// keep the channel empty since we're already processing work
default:
}
job := j.getNextJob()
if job != nil {
j.workerPool.dispatch(job)
} else {
break
}
}
// listen for a wake-up when an emtpy job manager has been given
// new work
select {
case <-j.quit:
j.wg.Done()
return
case <-j.newWork:
break
}
}
}()
}
// addQueue generates a new queue if a queue for `queueID` doesn't exist
// note: this must be called with l held for write
func (j *JobManager) addQueue(queueID string) {
if _, ok := j.queues[queueID]; !ok {
j.queues[queueID] = list.New()
j.queuesIndex = append(j.queuesIndex, queueID)
}
}
// removeLastQueueAccessed removes the queue and index map for the last queue
// accessed. It is to be used when the last queue accessed has emptied.
// note: this must be called with l held for write
func (j *JobManager) removeLastQueueAccessed() {
if j.lastQueueAccessed == -1 || j.lastQueueAccessed > len(j.queuesIndex)-1 {
j.logger.Warn("call to remove queue out of bounds", "idx", j.lastQueueAccessed)
return
}
queueID := j.queuesIndex[j.lastQueueAccessed]
// remove the queue
delete(j.queues, queueID)
// remove the index for the queue
j.queuesIndex = append(j.queuesIndex[:j.lastQueueAccessed], j.queuesIndex[j.lastQueueAccessed+1:]...)
// correct the last queue accessed for round robining
if j.lastQueueAccessed > 0 {
j.lastQueueAccessed--
} else {
j.lastQueueAccessed = len(j.queuesIndex) - 1
}
}