open-consul/agent/consul/watch/server_local.go

package watch

import (
	"context"
	"errors"
	"fmt"
	"time"

	"github.com/hashicorp/go-memdb"
	hashstructure_v2 "github.com/mitchellh/hashstructure/v2"

	"github.com/hashicorp/consul/lib/retry"
)

// Sentinel errors describing the outcome of a query callback.
var (
	// ErrorNotFound is the error a query callback returns when the data it
	// looked for does not exist. ServerLocalBlockingQuery matches it with
	// errors.Is and handles it specially instead of propagating it.
	ErrorNotFound = errors.New("no data found for query")

	// ErrorNotChanged carries the message "data did not change for query".
	// It is not referenced by the code visible in this file.
	ErrorNotChanged = errors.New("data did not change for query")
)

// Errors returned by serverLocalNotify, one per required argument, when that
// argument is nil.
var (
	errNilContext  = errors.New("cannot call ServerLocalNotify with a nil context")
	errNilGetStore = errors.New("cannot call ServerLocalNotify without a callback to get a StateStore")
	errNilQuery    = errors.New("cannot call ServerLocalNotify without a callback to perform the query")
	errNilNotify   = errors.New("cannot call ServerLocalNotify without a callback to send notifications")
)

// StateStore is the minimal view of a state store needed by this package. The
// only capability required is access to the store's abandon channel, which
// ServerLocalBlockingQuery adds to every watch set so that a blocked query
// wakes up when a snapshot restore happens.
//
//go:generate mockery --name StateStore --inpackage --filename mock_StateStore_test.go
type StateStore interface {
	// AbandonCh returns the channel used to detect that the store has been
	// abandoned (for example by a snapshot restore).
	AbandonCh() <-chan struct{}
}

// Backoff settings copied by defaultWaiter into the MinFailures, MinWait,
// MaxWait and Factor fields of the retry.Waiter it builds.
const (
	defaultWaiterMinFailures uint          = 1
	defaultWaiterMinWait     time.Duration = time.Second
	defaultWaiterMaxWait     time.Duration = time.Minute
	defaultWaiterFactor      time.Duration = 2 * time.Second
)

// defaultWaiterJitter is the jitter value that defaultWaiter assigns to every
// waiter it builds. It is created once, so all of those waiters share it.
// NOTE(review): the argument 100 is presumably a percentage; confirm against
// the lib/retry package.
var defaultWaiterJitter = retry.NewJitter(100)

// defaultWaiter returns a newly allocated retry.Waiter configured from the
// package-level defaultWaiter* settings. ServerLocalNotify uses it for the
// error backoff of the notification goroutine.
func defaultWaiter() *retry.Waiter {
	w := new(retry.Waiter)
	w.MinFailures = defaultWaiterMinFailures
	w.MinWait = defaultWaiterMinWait
	w.MaxWait = defaultWaiterMaxWait
	w.Factor = defaultWaiterFactor
	w.Jitter = defaultWaiterJitter
	return w
}

// noopDone does nothing. ServerLocalNotify passes it to serverLocalNotify as
// the done callback because public callers do not need to know when the
// notification goroutine has finished.
func noopDone() {}

// ServerLocalBlockingQuery runs query against the store returned by getStore
// and keeps re-running it, blocking in between, until the query reports an
// index greater than minIndex, the context is done, or the store is abandoned.
// It is modelled on the pre-existing blockingQuery method of the
// agent/consul.Server type, with a few key differences:
//
//  1. It uses Go 1.18 generics and is parameterized with two types. ResultType
//     can be anything; making it a type parameter instead of interface{} means
//     call sites need no type coercion from interface{} to the real type.
//     StoreType only VERY loosely resembles an agent/consul/state.Store: the
//     StateStore interface in this package has a single method returning the
//     store's abandon channel, so that a snapshot restore can be noticed and
//     acted upon. Accepting a real *state.Store instead would have added a
//     concrete dependency on the state package and made this function harder
//     to test.
//
//     The ResultType parameter could have been avoided by doing what the
//     original blockingQuery method did: assume every caller closes over a
//     pointer to its results and modifies it as necessary. That approach was
//     rejected here, in part because it pushes the concern of spurious wakeup
//     suppression down into every call site.
//
//  2. There is no internal timeout, so the call can potentially run forever
//     waiting for a state change. If a timeout is desired it must be built
//     into the context.Context passed as the first argument.
//
//  3. Hashing of results, to avoid sending back data when nothing has actually
//     changed, is built in. With the old blockingQuery method this had to be
//     done inside the closure passed to the method, duplicating the same code
//     in many places. Because the behavior is not needed in many scenarios it
//     is opt-in through the suppressSpuriousWakeup argument.
//
// Similar to the older method, errors returned from the query are propagated
// back to the caller. ErrorNotFound is the exception and is never returned:
// when minIndex is 0 the query's index and result are returned immediately
// with a nil error, otherwise the function goes on to block.
//
// The returned index is never 0 once the query has run; a zero index from the
// query is replaced with 1. When the context ends while blocked, or the store
// is abandoned, the most recent index and result are returned with a nil
// error. A nil getStore or query yields a zero index, a zero ResultType and an
// error.
//
// The query function must follow these rules:
//
//  1. To access data it must use the passed in StoreType (which will be a
//     state.Store when everything gets stitched together outside of unit
//     tests).
//  2. It must return an index greater than the minIndex if the results
//     returned by the query have changed.
//  3. Any channels added to the memdb.WatchSet must unblock when the results
//     returned by the query have changed.
//
// To ensure optimal performance of the query, the query function should make a
// best-effort attempt to follow these guidelines:
//
//  1. Only return an index greater than the minIndex.
//  2. Any channels added to the memdb.WatchSet should only unblock when the
//     results returned by the query have changed. This might be difficult
//     to do when blocking on non-existent data.
func ServerLocalBlockingQuery[ResultType any, StoreType StateStore](
	ctx context.Context,
	getStore func() StoreType,
	minIndex uint64,
	suppressSpuriousWakeup bool,
	query func(memdb.WatchSet, StoreType) (uint64, ResultType, error),
) (uint64, ResultType, error) {
	if getStore == nil {
		var zero ResultType
		return 0, zero, errors.New("no getStore function was provided to ServerLocalBlockingQuery")
	}
	if query == nil {
		var zero ResultType
		return 0, zero, errors.New("no query function was provided to ServerLocalBlockingQuery")
	}

	// State carried from one pass of the loop to the next.
	var (
		// sawNotFound is set once any pass has ended in ErrorNotFound and is
		// never cleared afterwards.
		sawNotFound bool
		// haveHash reports whether lastHash holds the hash computed by an
		// earlier pass.
		haveHash bool
		lastHash uint64
	)

	for {
		// The store is fetched again on every pass.
		store := getStore()

		// Watching the abandon channel lets a snapshot restore wake us up
		// even when it would not otherwise touch anything the query is
		// watching. Without it we could end up blocking indefinitely.
		watches := memdb.NewWatchSet()
		watches.Add(store.AbandonCh())

		idx, res, queryErr := query(watches, store)

		// Never hand back a zero index. The index is generally expected to
		// be a Raft index, which can never be 0, or the max index of the
		// table when there are no results, but that cannot be guaranteed.
		// Defaulting to 1 keeps a client from accidentally issuing many
		// non-blocking queries (and the load they cause) in most cases.
		if idx == 0 {
			idx = 1
		}

		if errors.Is(queryErr, ErrorNotFound) {
			// With a zero minIndex we must never block, and the not-found
			// error is not propagated either.
			if minIndex == 0 {
				return idx, res, nil
			}

			// If an earlier pass was also not-found, move minIndex up to the
			// index just returned. This avoids returning data needlessly
			// when we end up watching the root of a memdb radix tree
			// because the watched data does not exist yet.
			if sawNotFound {
				minIndex = idx
			}
			sawNotFound = true
		} else if queryErr != nil {
			return idx, res, queryErr
		}

		// Optionally suppress wakeups that produced identical data.
		if suppressSpuriousWakeup {
			hash, hashErr := hashstructure_v2.Hash(res, hashstructure_v2.FormatV2, nil)
			if hashErr != nil {
				return idx, res, fmt.Errorf("error hashing data for spurious wakeup suppression: %w", hashErr)
			}

			// Same hash as the previous pass: raise minIndex to the returned
			// index so the unchanged result is not sent back.
			if haveHash && hash == lastHash {
				minIndex = idx
			}
			haveHash, lastHash = true, hash
		}

		// Final decision on whether we are unblocked. The adjustments to
		// minIndex above exist to make this comparison fail when returning
		// would only deliver unchanged data, which mostly protects watches
		// on data that does not exist from returning the non-existent value
		// over and over.
		if idx > minIndex {
			return idx, res, nil
		}

		// Block until something changes. Because the abandon channel is in
		// the watch set, a snapshot restore unblocks us as well as changes
		// to the queried data. A non-nil error means the context ended, in
		// which case we return what we have without an error.
		if watchErr := watches.WatchCtx(ctx); watchErr != nil {
			return idx, res, nil
		}

		// If the store was abandoned, return the latest result instead of
		// running the query again.
		select {
		case <-store.AbandonCh():
			return idx, res, nil
		default:
		}
	}
}

// ServerLocalNotify watches the state store for changes using the provided
// query function and invokes the notify callback whenever the results of that
// query have changed. An error is returned if parameter validation fails;
// otherwise the background goroutine that processes notifications is started
// and nil is returned. Like ServerLocalBlockingQuery it relies on Go generics,
// for the reasons outlined in that function's documentation.
func ServerLocalNotify[ResultType any, StoreType StateStore](
	ctx context.Context,
	correlationID string,
	getStore func() StoreType,
	query func(memdb.WatchSet, StoreType) (uint64, ResultType, error),
	notify func(ctx context.Context, correlationID string, result ResultType, err error),
) error {
	// Public callers do not need to know when the internal goroutine has
	// finished. The internal function accepts a done callback only so that
	// tests can be deterministic instead of relying on arbitrary sleeps.
	done := noopDone

	// Public callers cannot override the error backoff configuration. The
	// internal function accepts a waiter so that unit tests can run much more
	// quickly.
	waiter := defaultWaiter()

	return serverLocalNotify(ctx, correlationID, getStore, query, notify, done, waiter)
}

// serverLocalNotify is the internal version of ServerLocalNotify. On top of
// the public arguments it takes the backoff waiter to use and a done function
// that is called when the notification goroutine has finished.
//
// ctx, getStore, query and notify are validated in that order and the
// matching errNil* error is returned for the first one that is nil. done and
// waiter are not validated. When validation passes the goroutine is started
// and nil is returned.
func serverLocalNotify[ResultType any, StoreType StateStore](
	ctx context.Context,
	correlationID string,
	getStore func() StoreType,
	query func(memdb.WatchSet, StoreType) (uint64, ResultType, error),
	notify func(ctx context.Context, correlationID string, result ResultType, err error),
	done func(),
	waiter *retry.Waiter,
) error {
	switch {
	case ctx == nil:
		return errNilContext
	case getStore == nil:
		return errNilGetStore
	case query == nil:
		return errNilQuery
	case notify == nil:
		return errNilNotify
	}

	go serverLocalNotifyRoutine(ctx, correlationID, getStore, query, notify, done, waiter)

	return nil
}

// serverLocalNotifyRoutine is meant to run in its own goroutine and process
// updates until the context is cancelled. It performs no parameter validation
// and does not check that the callbacks are non-nil; the in-package caller is
// assumed to have done so already. The backoff waiter is an argument so that
// unit tests in this package can override the defaults that the exported
// ServerLocalNotify function sets up.
//
// Each pass runs ServerLocalBlockingQuery with spurious wakeup suppression
// enabled. notify is called when minIndex is still zero or when the returned
// index is greater than minIndex, and it receives the query's error as well as
// its result. done is called when the routine returns.
func serverLocalNotifyRoutine[ResultType any, StoreType StateStore](
	ctx context.Context,
	correlationID string,
	getStore func() StoreType,
	query func(memdb.WatchSet, StoreType) (uint64, ResultType, error),
	notify func(ctx context.Context, correlationID string, result ResultType, err error),
	done func(),
	waiter *retry.Waiter,
) {
	defer done()

	var minIndex uint64

	// Stop issuing queries as soon as the context has been cancelled.
	for ctx.Err() == nil {
		idx, res, queryErr := ServerLocalBlockingQuery(ctx, getStore, minIndex, true, query)

		// The context may have been cancelled while the query was blocked;
		// in that case no further notifications are sent.
		if ctx.Err() != nil {
			return
		}

		// Notify on the very first result and whenever the index advanced.
		if minIndex == 0 || idx > minIndex {
			notify(ctx, correlationID, res, queryErr)
			minIndex = idx
		}

		// Back off on errors. A badly behaved blocking call that returned a
		// zero index is treated as a failure too, so that we cannot get
		// stuck in a busy loop.
		succeeded := queryErr == nil && idx > 0
		if succeeded {
			waiter.Reset()
		} else if waiter.Wait(ctx) != nil {
			return
		}

		// Never carry a zero index into the next query after a success.
		if queryErr == nil && minIndex == 0 {
			minIndex = 1
		}
	}
}