open-vault/vault/rollback.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package vault

import (
	"context"
	"errors"
	"fmt"
	"os"
	"strconv"
	"strings"
	"sync"
	"time"

	metrics "github.com/armon/go-metrics"
	"github.com/gammazero/workerpool"
	log "github.com/hashicorp/go-hclog"
	"github.com/hashicorp/vault/helper/namespace"
	"github.com/hashicorp/vault/sdk/logical"
)

const (
	RollbackDefaultNumWorkers = 256
	RollbackWorkersEnvVar     = "VAULT_ROLLBACK_WORKERS"
)

// RollbackManager is responsible for performing rollbacks of partial
// secrets within logical backends.
//
// During normal operations, it is possible for logical backends to
// error partially through an operation. These are called "partial secrets":
// they are never sent back to a user, but they do need to be cleaned up.
// This manager handles that by periodically (on a timer) requesting that the
// backends clean up.
//
// The RollbackManager periodically initiates a logical.RollbackOperation
// on every mounted logical backend. It ensures that only one rollback operation
// is in-flight at any given time within a single seal/unseal phase.
type RollbackManager struct {
	logger log.Logger

	// This gives the current mount table of both logical and credential backends,
	// plus a RWMutex that is locked for reading. It is up to the caller to RUnlock
	// it when done with the mount table.
	backends func() []*MountEntry

	router *Router
	period time.Duration

	inflightAll  sync.WaitGroup
	inflight     map[string]*rollbackState
	inflightLock sync.RWMutex

	doneCh          chan struct{}
	shutdown        bool
	shutdownCh      chan struct{}
	shutdownLock    sync.Mutex
	stopTicker      chan struct{}
	tickerIsStopped bool
	quitContext     context.Context
	runner          *workerpool.WorkerPool
	core            *Core
	// This channel is used for testing
	rollbacksDoneCh chan struct{}
}

// rollbackState is used to track the state of a single rollback attempt
type rollbackState struct {
	lastError error
	sync.WaitGroup
	cancelLockGrabCtx       context.Context
	cancelLockGrabCtxCancel context.CancelFunc
	// scheduled is the time that this job was created and submitted to the
	// rollbackRunner
	scheduled time.Time
}

// NewRollbackManager is used to create a new rollback manager
func NewRollbackManager(ctx context.Context, logger log.Logger, backendsFunc func() []*MountEntry, router *Router, core *Core) *RollbackManager {
	r := &RollbackManager{
		logger:      logger,
		backends:    backendsFunc,
		router:      router,
		period:      core.rollbackPeriod,
		inflight:    make(map[string]*rollbackState),
		doneCh:      make(chan struct{}),
		shutdownCh:  make(chan struct{}),
		stopTicker:  make(chan struct{}),
		quitContext: ctx,
		core:        core,
	}
	numWorkers := r.numRollbackWorkers()
	r.logger.Info(fmt.Sprintf("Starting the rollback manager with %d workers", numWorkers))
	r.runner = workerpool.New(numWorkers)
	return r
}

func (m *RollbackManager) numRollbackWorkers() int {
	numWorkers := m.core.numRollbackWorkers
	envOverride := os.Getenv(RollbackWorkersEnvVar)
	if envOverride != "" {
		envVarWorkers, err := strconv.Atoi(envOverride)
		if err != nil || envVarWorkers < 1 {
			m.logger.Warn(fmt.Sprintf("%s must be a positive integer, but was %s", RollbackWorkersEnvVar, envOverride))
		} else {
			numWorkers = envVarWorkers
		}
	}
	return numWorkers
}

// Start starts the rollback manager
func (m *RollbackManager) Start() {
	go m.run()
}

// Stop stops the running manager. This will wait for any in-flight
// rollbacks to complete.
func (m *RollbackManager) Stop() {
	m.shutdownLock.Lock()
	defer m.shutdownLock.Unlock()
	if !m.shutdown {
		m.shutdown = true
		close(m.shutdownCh)
		<-m.doneCh
	}
	m.runner.StopWait()
}

// StopTicker stops the automatic Rollback manager's ticker, causing us
// to not do automatic rollbacks. This is useful for testing plugin's
// periodic function's behavior, without trying to race against the
// rollback manager proper.
//
// THIS SHOULD ONLY BE CALLED FROM TEST HELPERS.
func (m *RollbackManager) StopTicker() {
	if !m.tickerIsStopped {
		close(m.stopTicker)
		m.tickerIsStopped = true
	}
}

// run is a long running routine to periodically invoke rollback
func (m *RollbackManager) run() {
	m.logger.Info("starting rollback manager")
	tick := time.NewTicker(m.period)
	logTestStopOnce := false
	defer tick.Stop()
	defer close(m.doneCh)
	for {
		select {
		case <-tick.C:
			m.triggerRollbacks()

		case <-m.shutdownCh:
			m.logger.Info("stopping rollback manager")
			return

		case <-m.stopTicker:
			if !logTestStopOnce {
				m.logger.Info("stopping rollback manager ticker for tests")
				logTestStopOnce = true
			}
			tick.Stop()
		}
	}
}

// triggerRollbacks is used to trigger the rollbacks across all the backends
func (m *RollbackManager) triggerRollbacks() {
	backends := m.backends()

	for _, e := range backends {
		path := e.Path
		if e.Table == credentialTableType {
			path = credentialRoutePrefix + path
		}

		// When the mount is filtered, the backend will be nil
		ctx := namespace.ContextWithNamespace(m.quitContext, e.namespace)
		backend := m.router.MatchingBackend(ctx, path)
		if backend == nil {
			continue
		}
		fullPath := e.namespace.Path + path

		// Start a rollback if necessary
		m.startOrLookupRollback(ctx, fullPath, true)
	}
}

// startOrLookupRollback is used to start an async rollback attempt.
// This must be called with the inflightLock held.
func (m *RollbackManager) startOrLookupRollback(ctx context.Context, fullPath string, grabStatelock bool) *rollbackState {
	m.inflightLock.Lock()
	defer m.inflightLock.Unlock()
	defer metrics.SetGauge([]string{"rollback", "queued"}, float32(m.runner.WaitingQueueSize()))
	defer metrics.SetGauge([]string{"rollback", "inflight"}, float32(len(m.inflight)))
	rsInflight, ok := m.inflight[fullPath]
	if ok {
		return rsInflight
	}

	cancelCtx, cancelFunc := context.WithCancel(context.Background())
	rs := &rollbackState{
		cancelLockGrabCtx:       cancelCtx,
		cancelLockGrabCtxCancel: cancelFunc,
	}

	// If no inflight rollback is already running, kick one off
	m.inflight[fullPath] = rs
	rs.Add(1)
	m.inflightAll.Add(1)
	rs.scheduled = time.Now()
	select {
	case <-m.doneCh:
		// if we've already shut down, then don't submit the task to avoid a panic
		// we should still call finishRollback for the rollback state in order to remove
		// it from the map and decrement the waitgroup.

		// we already have the inflight lock, so we can't grab it here
		m.finishRollback(rs, errors.New("rollback manager is stopped"), fullPath, false)
	default:
		m.runner.Submit(func() {
			m.attemptRollback(ctx, fullPath, rs, grabStatelock)
			select {
			case m.rollbacksDoneCh <- struct{}{}:
			default:
			}
		})

	}
	return rs
}

func (m *RollbackManager) finishRollback(rs *rollbackState, err error, fullPath string, grabInflightLock bool) {
	rs.lastError = err
	rs.Done()
	m.inflightAll.Done()
	if grabInflightLock {
		m.inflightLock.Lock()
		defer m.inflightLock.Unlock()
	}
	delete(m.inflight, fullPath)
}

// attemptRollback invokes a RollbackOperation for the given path
func (m *RollbackManager) attemptRollback(ctx context.Context, fullPath string, rs *rollbackState, grabStatelock bool) (err error) {
	metrics.MeasureSince([]string{"rollback", "waiting"}, rs.scheduled)
	defer metrics.MeasureSince([]string{"rollback", "attempt", strings.ReplaceAll(fullPath, "/", "-")}, time.Now())
	defer m.finishRollback(rs, err, fullPath, true)

	ns, err := namespace.FromContext(ctx)
	if err != nil {
		m.logger.Error("rollback failed to derive namespace from context", "path", fullPath)
		return err
	}
	if ns == nil {
		m.logger.Error("rollback found no namespace", "path", fullPath)
		return namespace.ErrNoNamespace
	}

	// Invoke a RollbackOperation
	req := &logical.Request{
		Operation: logical.RollbackOperation,
		Path:      ns.TrimmedPath(fullPath),
	}

	releaseLock := true
	if grabStatelock {
		doneCh := make(chan struct{})
		defer close(doneCh)

		stopCh := make(chan struct{})
		go func() {
			defer close(stopCh)

			select {
			case <-m.shutdownCh:
			case <-rs.cancelLockGrabCtx.Done():
			case <-doneCh:
			}
		}()

		// Grab the statelock or stop
		l := newLockGrabber(m.core.stateLock.RLock, m.core.stateLock.RUnlock, stopCh)
		go l.grab()
		if stopped := l.lockOrStop(); stopped {
			// If we stopped due to shutdown, return. Otherwise another thread
			// is holding the lock for us, continue on.
			select {
			case <-m.shutdownCh:
				return errors.New("rollback shutting down")
			default:
				releaseLock = false
			}
		}
	}

	var cancelFunc context.CancelFunc
	ctx, cancelFunc = context.WithTimeout(ctx, DefaultMaxRequestDuration)
	resp, err := m.router.Route(ctx, req)
	if grabStatelock && releaseLock {
		m.core.stateLock.RUnlock()
	}
	cancelFunc()

	// If the error is an unsupported operation, then it doesn't
	// matter, the backend doesn't support it.
	if err == logical.ErrUnsupportedOperation {
		err = nil
	}
	// If we failed due to read-only storage, we can't do anything; ignore
	if (err != nil && strings.Contains(err.Error(), logical.ErrReadOnly.Error())) ||
		(resp.IsError() && strings.Contains(resp.Error().Error(), logical.ErrReadOnly.Error())) {
		err = nil
	}
	if err != nil {
		m.logger.Error("error rolling back", "path", fullPath, "error", err)
	}
	return
}

// Rollback is used to trigger an immediate rollback of the path,
// or to join an existing rollback operation if in flight. Caller should have
// core's statelock held (write OR read). If an already inflight rollback is
// happening this function will simply wait for it to complete
func (m *RollbackManager) Rollback(ctx context.Context, path string) error {
	ns, err := namespace.FromContext(ctx)
	if err != nil {
		return err
	}
	fullPath := ns.Path + path

	// Check for an existing attempt or start one if none
	rs := m.startOrLookupRollback(ctx, fullPath, false)

	// Since we have the statelock held, tell any inflight rollback to give up
	// trying to acquire it. This will prevent deadlocks in the case where we
	// have the write lock. In the case where it was waiting to grab
	// a read lock it will then simply continue with the rollback
	// operation under the protection of our write lock.
	rs.cancelLockGrabCtxCancel()

	// It's safe to do this, since the other thread either already has the lock
	// held, or we just canceled it above.
	rs.Wait()

	// Return the last error
	return rs.lastError
}

// The methods below are the hooks from core that are called pre/post seal.

// startRollback is used to start the rollback manager after unsealing
func (c *Core) startRollback() error {
	backendsFunc := func() []*MountEntry {
		ret := []*MountEntry{}
		c.mountsLock.RLock()
		defer c.mountsLock.RUnlock()
		// During teardown/setup after a leader change or unseal there could be
		// something racy here so make sure the table isn't nil
		if c.mounts != nil {
			for _, entry := range c.mounts.Entries {
				ret = append(ret, entry)
			}
		}
		c.authLock.RLock()
		defer c.authLock.RUnlock()
		// During teardown/setup after a leader change or unseal there could be
		// something racy here so make sure the table isn't nil
		if c.auth != nil {
			for _, entry := range c.auth.Entries {
				ret = append(ret, entry)
			}
		}
		return ret
	}
	rollbackLogger := c.baseLogger.Named("rollback")
	c.AddLogger(rollbackLogger)
	c.rollback = NewRollbackManager(c.activeContext, rollbackLogger, backendsFunc, c.router, c)
	c.rollback.Start()
	return nil
}

// stopRollback is used to stop running the rollback manager before sealing
func (c *Core) stopRollback() error {
	if c.rollback != nil {
		c.rollback.Stop()
		c.rollback = nil
	}
	return nil
}