eval delete: move batching of deletes into RPC handler and state (#15117)

During unusual outage recovery scenarios on large clusters, a backlog of millions of evaluations can appear. In these cases, the `eval delete` command can put excessive load on the cluster by listing large sets of evals to extract the IDs and then sending larges batches of IDs. Although the command's batch size was carefully tuned, we still need to be JSON deserialize, re-serialize to MessagePack, send the log entries through raft, and get the FSM applied. To improve performance of this recovery case, move the batching process into the RPC handler and the state store. The design here is a little weird, so let's look a the failed options first: * A naive solution here would be to just send the filter as the raft request and let the FSM apply delete the whole set in a single operation. Benchmarking with 1M evals on a 3 node cluster demonstrated this can block the FSM apply for several minutes, which puts the cluster at risk if there's a leadership failover (the barrier write can't be made while this apply is in-flight). * A less naive but still bad solution would be to have the RPC handler filter and paginate, and then hand a list of IDs to the existing raft log entry. Benchmarks showed this blocked the FSM apply for 20-30s at a time and took roughly an hour to complete. Instead, we're filtering and paginating in the RPC handler to find a page token, and then passing both the filter and page token in the raft log. The FSM apply recreates the paginator using the filter and page token to get roughly the same page of evaluations, which it then deletes. The pagination process is fairly cheap (only abut 5% of the total FSM apply time), so counter-intuitively this rework ends up being much faster. A benchmark of 1M evaluations showed this blocked the FSM apply for 20-30ms at a time (typical for normal operations) and completes in less than 4 minutes. Note that, as with the existing design, this delete is not consistent: a new evaluation inserted "behind" the cursor of the pagination will fail to be deleted.
2022-11-14 14:08:13 -05:00 · 2022-11-14 14:08:13 -05:00 · 37134a4a37
parent 345ef0bbec
commit 37134a4a37
13 changed files with 434 additions and 136 deletions
--- a/.changelog/15117.txt
+++ b/.changelog/15117.txt
@ -0,0 +1,3 @@
+```release-note:improvement
+cli: Improved performance of eval delete with large filter sets
+```
--- a/api/evaluations.go
+++ b/api/evaluations.go
@ -62,6 +62,16 @@ func (e *Evaluations) Delete(evalIDs []string, w *WriteOptions) (*WriteMeta, err
 	return wm, nil
 }

+// DeleteOpts is used to batch delete evaluations using a filter.
+func (e *Evaluations) DeleteOpts(req *EvalDeleteRequest, w *WriteOptions) (*EvalDeleteResponse, *WriteMeta, error) {
+	resp := &EvalDeleteResponse{}
+	wm, err := e.client.delete("/v1/evaluations", &req, resp, w)
+	if err != nil {
+		return nil, nil, err
+	}
+	return resp, wm, nil
+}
+
 // Allocations is used to retrieve a set of allocations given
 // an evaluation ID.
 func (e *Evaluations) Allocations(evalID string, q *QueryOptions) ([]*AllocationListStub, *QueryMeta, error) {
@ -140,9 +150,14 @@ type EvaluationStub struct {

 type EvalDeleteRequest struct {
 	EvalIDs []string
+	Filter  string
 	WriteRequest
 }

+type EvalDeleteResponse struct {
+	Count int
+}
+
 type EvalCountResponse struct {
 	Count int
 	QueryMeta
--- a/command/agent/eval_endpoint.go
+++ b/command/agent/eval_endpoint.go
@ -56,13 +56,19 @@ func (s *HTTPServer) evalsDeleteRequest(resp http.ResponseWriter, req *http.Requ

 	numIDs := len(args.EvalIDs)

-	// Ensure the number of evaluation IDs included in the request is within
-	// bounds.
-	if numIDs < 1 {
-		return nil, CodedError(http.StatusBadRequest, "request does not include any evaluation IDs")
-	} else if numIDs > structs.MaxUUIDsPerWriteRequest {
+	if args.Filter != "" && numIDs > 0 {
+		return nil, CodedError(http.StatusBadRequest,
+			"evals cannot be deleted by both ID and filter")
+	}
+	if args.Filter == "" && numIDs == 0 {
+		return nil, CodedError(http.StatusBadRequest,
+			"evals must be deleted by either ID or filter")
+	}
+
+	// If an explicit list of evaluation IDs is sent, ensure its within bounds
+	if numIDs > structs.MaxUUIDsPerWriteRequest {
 		return nil, CodedError(http.StatusBadRequest, fmt.Sprintf(
-			"request includes %v evaluations IDs, must be %v or fewer",
+			"request includes %v evaluation IDs, must be %v or fewer",
 			numIDs, structs.MaxUUIDsPerWriteRequest))
 	}

@ -73,8 +79,9 @@ func (s *HTTPServer) evalsDeleteRequest(resp http.ResponseWriter, req *http.Requ
 	if err := s.agent.RPC(structs.EvalDeleteRPCMethod, &args, &reply); err != nil {
 		return nil, err
 	}
+
 	setIndex(resp, reply.Index)
-	return nil, nil
+	return reply, nil
 }

 func (s *HTTPServer) EvalSpecificRequest(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
--- a/command/agent/eval_endpoint_test.go
+++ b/command/agent/eval_endpoint_test.go
@ -138,7 +138,7 @@ func TestHTTP_EvalsDelete(t *testing.T) {
 					// Make the request and check the response.
 					obj, err := s.Server.EvalsRequest(respW, req)
 					require.Equal(t,
-						CodedError(http.StatusBadRequest, "request does not include any evaluation IDs"), err)
+						CodedError(http.StatusBadRequest, "evals must be deleted by either ID or filter"), err)
 					require.Nil(t, obj)
 				})
 			},
@ -169,7 +169,7 @@ func TestHTTP_EvalsDelete(t *testing.T) {
 					obj, err := s.Server.EvalsRequest(respW, req)
 					require.Equal(t,
 						CodedError(http.StatusBadRequest,
-							"request includes 8000 evaluations IDs, must be 7281 or fewer"), err)
+							"request includes 8000 evaluation IDs, must be 7281 or fewer"), err)
 					require.Nil(t, obj)
 				})
 			},
@ -223,8 +223,10 @@ func TestHTTP_EvalsDelete(t *testing.T) {

 					// Make the request and check the response.
 					obj, err := s.Server.EvalsRequest(respW, req)
-					require.Nil(t, err)
-					require.Nil(t, obj)
+					require.NoError(t, err)
+					require.NotNil(t, obj)
+					deleteResp := obj.(structs.EvalDeleteResponse)
+					require.Equal(t, deleteResp.Count, 1)

 					// Ensure the eval is not found.
 					readEval, err := s.Agent.server.State().EvalByID(nil, mockEval.ID)
--- a/command/eval_delete.go
+++ b/command/eval_delete.go
@ -4,7 +4,6 @@ import (
 	"errors"
 	"fmt"
 	"strings"
-	"time"

 	"github.com/hashicorp/nomad/api"
 	"github.com/hashicorp/nomad/api/contexts"
@ -58,7 +57,9 @@ Eval Delete Options:
  -filter
    Specifies an expression used to filter evaluations by for deletion. When
    using this flag, it is advisable to ensure the syntax is correct using the
-    eval list command first.
+    eval list command first. Note that deleting evals by filter is imprecise:
+    for sets of evals larger than a single raft log batch, evals can be inserted
+    behind the cursor and therefore be missed.

  -yes
    Bypass the confirmation prompt if an evaluation ID was not provided.
@ -148,32 +149,7 @@ func (e *EvalDeleteCommand) Run(args []string) int {
 		e.deleteByArg = true
 		exitCode, err = e.handleEvalArgDelete(args[0])
 	default:
-
-		// Track the next token, so we can iterate all pages that match the
-		// passed filter.
-		var nextToken string
-
-		// It is possible the filter matches a large number of evaluations
-		// which means we need to run a number of batch deletes. Perform
-		// iteration here rather than recursion in later function, so we avoid
-		// any potential issues with stack size limits.
-		for {
-			exitCode, nextToken, err = e.handleFlagFilterDelete(nextToken)
-
-			// If there is another page of evaluations matching the filter,
-			// iterate the loop and delete the next batch of evals. We pause
-			// for a 500ms rather than just run as fast as the code and machine
-			// possibly can. This means deleting 13million evals will take
-			// roughly 13-15 mins, which seems reasonable. It is worth noting,
-			// we do not expect operators to delete this many evals in a single
-			// run and expect more careful filtering options to be used.
-			if nextToken != "" {
-				time.Sleep(500 * time.Millisecond)
-				continue
-			} else {
-				break
-			}
-		}
+		exitCode, err = e.handleDeleteByFilter(e.filter)
 	}

 	// Do not exit if we got an error as it's possible this was on the
@ -228,93 +204,6 @@ func (e *EvalDeleteCommand) handleEvalArgDelete(evalID string) (int, error) {
 	return code, err
 }

-// handleFlagFilterDelete handles deletion of evaluations discovered using
-// the filter. It is unknown how many will match the operator criteria so
-// this function batches lookup and delete requests into sensible numbers.
-func (e *EvalDeleteCommand) handleFlagFilterDelete(nt string) (int, string, error) {
-
-	evalsToDelete, nextToken, err := e.batchLookupEvals(nt)
-	if err != nil {
-		return 1, "", err
-	}
-
-	numEvalsToDelete := len(evalsToDelete)
-
-	// The filter flags are operator controlled, therefore ensure we
-	// actually found some evals to delete. Otherwise, inform the operator
-	// their flags are potentially incorrect.
-	if numEvalsToDelete == 0 {
-		if e.numDeleted > 0 {
-			return 0, "", nil
-		} else {
-			return 1, "", errors.New("failed to find any evals that matched filter criteria")
-		}
-	}
-
-	if code, actioned, err := e.batchDelete(evalsToDelete); err != nil {
-		return code, "", err
-	} else if !actioned {
-		return code, "", nil
-	}
-
-	e.Ui.Info(fmt.Sprintf("Successfully deleted batch of %v %s",
-		numEvalsToDelete, correctGrammar("evaluation", numEvalsToDelete)))
-
-	return 0, nextToken, nil
-}
-
-// batchLookupEvals handles batched lookup of evaluations using the operator
-// provided filter. The lookup is performed a maximum number of 3 times to
-// ensure their size is limited and the number of evals to delete doesn't exceed
-// the total allowable in a single call.
-//
-// The JSON serialized evaluation API object is 350-380B in size.
-// 2426 * 380B (3.8e-4 MB) = 0.92MB. We may want to make this configurable
-// in the future, but this is counteracted by the CLI logic which will loop
-// until the user tells it to exit, or all evals matching the filter are
-// deleted. 2426 * 3 falls below the maximum limit for eval IDs in a single
-// delete request (set by MaxEvalIDsPerDeleteRequest).
-func (e *EvalDeleteCommand) batchLookupEvals(nextToken string) ([]*api.Evaluation, string, error) {
-
-	var evalsToDelete []*api.Evaluation
-	currentNextToken := nextToken
-
-	// Call List 3 times to accumulate the maximum number if eval IDs supported
-	// in a single Delete request. See math above.
-	for i := 0; i < 3; i++ {
-
-		// Generate the query options using the passed next token and filter. The
-		// per page value is less than the total number we can include in a single
-		// delete request. This keeps the maximum size of the return object at a
-		// reasonable size.
-		opts := api.QueryOptions{
-			Filter:    e.filter,
-			PerPage:   2426,
-			NextToken: currentNextToken,
-		}
-
-		evalList, meta, err := e.client.Evaluations().List(&opts)
-		if err != nil {
-			return nil, "", err
-		}
-
-		if len(evalList) > 0 {
-			evalsToDelete = append(evalsToDelete, evalList...)
-		}
-
-		// Store the next token no matter if it is empty or populated.
-		currentNextToken = meta.NextToken
-
-		// If there is no next token, ensure we exit and avoid any new loops
-		// which will result in duplicate IDs.
-		if currentNextToken == "" {
-			break
-		}
-	}
-
-	return evalsToDelete, currentNextToken, nil
-}
-
 // batchDelete is responsible for deleting the passed evaluations and asking
 // any confirmation questions along the way. It will ask whether the operator
 // want to list the evals before deletion, and optionally ask for confirmation
@ -404,3 +293,38 @@ func correctGrammar(word string, num int) string {
 	}
 	return word
 }
+
+func (e *EvalDeleteCommand) handleDeleteByFilter(filterExpr string) (int, error) {
+
+	// If the user did not wish to bypass the confirmation step, ask this now
+	// and handle the response.
+	if !e.yes && !e.deleteByArg {
+
+		resp, _, err := e.client.Evaluations().Count(&api.QueryOptions{
+			Filter: filterExpr,
+		})
+		if err != nil {
+			return 1, err
+		}
+
+		code, deleteEvals := e.askQuestion(fmt.Sprintf(
+			"Are you sure you want to delete %d evals? [y/N]",
+			resp.Count), "Cancelling eval deletion")
+		e.Ui.Output("")
+
+		if !deleteEvals {
+			return code, nil
+		}
+	}
+
+	resp, _, err := e.client.Evaluations().DeleteOpts(&api.EvalDeleteRequest{
+		Filter: filterExpr,
+	}, nil)
+	if err != nil {
+		return 1, err
+	}
+	e.numDeleted = resp.Count
+
+	return 0, nil
+
+}
--- a/nomad/eval_endpoint.go
+++ b/nomad/eval_endpoint.go
@ -11,6 +11,7 @@ import (
 	log "github.com/hashicorp/go-hclog"
 	memdb "github.com/hashicorp/go-memdb"
 	multierror "github.com/hashicorp/go-multierror"
+	version "github.com/hashicorp/go-version"

 	"github.com/hashicorp/nomad/acl"
 	"github.com/hashicorp/nomad/nomad/state"
@ -24,6 +25,8 @@ const (
 	DefaultDequeueTimeout = time.Second
 )

+var minVersionEvalDeleteByFilter = version.Must(version.NewVersion("1.4.3"))
+
 // Eval endpoint is used for eval interactions
 type Eval struct {
 	srv    *Server
@ -438,12 +441,37 @@ func (e *Eval) Delete(
 		return structs.ErrPermissionDenied
 	}

+	if args.Filter != "" && !ServersMeetMinimumVersion(
+		e.srv.Members(), e.srv.Region(), minVersionEvalDeleteByFilter, true) {
+		return fmt.Errorf(
+			"all servers must be running version %v or later to delete evals by filter",
+			minVersionEvalDeleteByFilter)
+	}
+	if args.Filter != "" && len(args.EvalIDs) > 0 {
+		return fmt.Errorf("evals cannot be deleted by both ID and filter")
+	}
+	if args.Filter == "" && len(args.EvalIDs) == 0 {
+		return fmt.Errorf("evals must be deleted by either ID or filter")
+	}
+
 	// The eval broker must be disabled otherwise Nomad's state will likely get
 	// wild in a very un-fun way.
 	if e.srv.evalBroker.Enabled() {
 		return errors.New("eval broker is enabled; eval broker must be paused to delete evals")
 	}

+	if args.Filter != "" {
+		count, index, err := e.deleteEvalsByFilter(args)
+		if err != nil {
+			return err
+		}
+
+		// Update the index and return.
+		reply.Index = index
+		reply.Count = count
+		return nil
+	}
+
 	// Grab the state snapshot, so we can look up relevant eval information.
 	serverStateSnapshot, err := e.srv.State().Snapshot()
 	if err != nil {
@ -451,6 +479,8 @@ func (e *Eval) Delete(
 	}
 	ws := memdb.NewWatchSet()

+	count := 0
+
 	// Iterate the evaluations and ensure they are safe to delete. It is
 	// possible passed evals are not safe to delete and would make Nomads state
 	// a little wonky. The nature of the RPC return error, means a single
@ -471,6 +501,7 @@ func (e *Eval) Delete(
 		if !ok {
 			return fmt.Errorf("eval %s is not safe to delete", evalInfo.ID)
 		}
+		count++
 	}

 	// Generate the Raft request object using the reap request object. This
@ -490,9 +521,97 @@ func (e *Eval) Delete(

 	// Update the index and return.
 	reply.Index = index
+	reply.Count = count
 	return nil
 }

+// deleteEvalsByFilter deletes evaluations in batches based on the filter. It
+// returns a count, the index, and any error
+func (e *Eval) deleteEvalsByFilter(args *structs.EvalDeleteRequest) (int, uint64, error) {
+	count := 0
+	index := uint64(0)
+
+	filter, err := bexpr.CreateEvaluator(args.Filter)
+	if err != nil {
+		return count, index, err
+	}
+
+	// Note that deleting evals by filter is imprecise: For sets of evals larger
+	// than a single batch eval inserts may occur behind the cursor and therefore
+	// be missed. This imprecision is not considered to hurt this endpoint's
+	// purpose of reducing pressure on servers during periods of heavy scheduling
+	// activity.
+	snap, err := e.srv.State().Snapshot()
+	if err != nil {
+		return count, index, fmt.Errorf("failed to lookup state snapshot: %v", err)
+	}
+
+	iter, err := snap.Evals(nil, state.SortDefault)
+	if err != nil {
+		return count, index, err
+	}
+
+	// We *can* send larger raft logs but rough benchmarks for deleting 1M evals
+	// show that a smaller page size strikes a balance between throughput and
+	// time we block the FSM apply for other operations
+	perPage := structs.MaxUUIDsPerWriteRequest / 10
+
+	raftReq := structs.EvalReapRequest{
+		Filter:        args.Filter,
+		PerPage:       int32(perPage),
+		UserInitiated: true,
+		WriteRequest:  args.WriteRequest,
+	}
+
+	// Note: Paginator is designed around fetching a single page for a single
+	// RPC call and finalizes its state after that page. So we're doing our own
+	// pagination here.
+	pageCount := 0
+	lastToken := ""
+
+	for {
+		raw := iter.Next()
+		if raw == nil {
+			break
+		}
+		eval := raw.(*structs.Evaluation)
+		deleteOk, err := snap.EvalIsUserDeleteSafe(nil, eval)
+		if !deleteOk || err != nil {
+			continue
+		}
+		match, err := filter.Evaluate(eval)
+		if !match || err != nil {
+			continue
+		}
+		pageCount++
+		lastToken = eval.ID
+
+		if pageCount >= perPage {
+			raftReq.PerPage = int32(pageCount)
+			_, index, err = e.srv.raftApply(structs.EvalDeleteRequestType, &raftReq)
+			if err != nil {
+				return count, index, err
+			}
+			count += pageCount
+
+			pageCount = 0
+			raftReq.NextToken = lastToken
+		}
+	}
+
+	// send last batch if it's partial
+	if pageCount > 0 {
+		raftReq.PerPage = int32(pageCount)
+		_, index, err = e.srv.raftApply(structs.EvalDeleteRequestType, &raftReq)
+		if err != nil {
+			return count, index, err
+		}
+		count += pageCount
+	}
+
+	return count, index, nil
+}
+
 // List is used to get a list of the evaluations in the system
 func (e *Eval) List(args *structs.EvalListRequest, reply *structs.EvalListResponse) error {
 	if done, err := e.srv.forward("Eval.List", args, args, reply); done {
--- a/nomad/eval_endpoint_test.go
+++ b/nomad/eval_endpoint_test.go
@ -8,6 +8,7 @@ import (
 	"time"

 	memdb "github.com/hashicorp/go-memdb"
+	"github.com/hashicorp/go-set"
 	msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc"
 	"github.com/hashicorp/nomad/acl"
 	"github.com/hashicorp/nomad/ci"
@ -864,6 +865,94 @@ func TestEvalEndpoint_Delete(t *testing.T) {
 		must.EqError(t, err, structs.ErrPermissionDenied.Error())
 	})

+	t.Run("successful delete by filter", func(t *testing.T) {
+
+		testServer, rootToken, cleanup := setup(t)
+		defer cleanup()
+		codec := rpcClient(t, testServer)
+
+		// Ensure broker is disabled
+		setBrokerEnabled(t, testServer, false)
+
+		evalCount := 10000
+		index := uint64(100)
+
+		store := testServer.fsm.State()
+
+		// Create a large set of pending evaluations
+
+		evals := []*structs.Evaluation{}
+		for i := 0; i < evalCount; i++ {
+			mockEval := mock.Eval()
+			evals = append(evals, mockEval)
+		}
+		must.NoError(t, store.UpsertEvals(
+			structs.MsgTypeTestSetup, index, evals))
+
+		// Create some evaluations we don't want to delete
+
+		evalsToKeep := []*structs.Evaluation{}
+		for i := 0; i < 3; i++ {
+			mockEval := mock.Eval()
+			mockEval.JobID = "keepme"
+			evalsToKeep = append(evalsToKeep, mockEval)
+		}
+		index++
+		must.NoError(t, store.UpsertEvals(
+			structs.MsgTypeTestSetup, index, evalsToKeep))
+
+		// Create a job with running allocs and evaluations those allocs reference
+
+		job := mock.Job()
+		job.ID = "notsafetodelete"
+		job.Status = structs.JobStatusRunning
+		index++
+		must.NoError(t, store.UpsertJob(structs.MsgTypeTestSetup, index, job))
+
+		evalsNotSafeToDelete := []*structs.Evaluation{}
+		for i := 0; i < 3; i++ {
+			mockEval := mock.Eval()
+			mockEval.JobID = job.ID
+			evalsNotSafeToDelete = append(evalsNotSafeToDelete, mockEval)
+		}
+		index++
+		must.NoError(t, store.UpsertEvals(
+			structs.MsgTypeTestSetup, index, evalsNotSafeToDelete))
+
+		allocs := []*structs.Allocation{}
+		for i := 0; i < 3; i++ {
+			alloc := mock.Alloc()
+			alloc.ClientStatus = structs.AllocClientStatusRunning
+			alloc.EvalID = evalsNotSafeToDelete[i].ID
+			allocs = append(allocs, alloc)
+		}
+		index++
+		must.NoError(t, store.UpsertAllocs(structs.MsgTypeTestSetup, index, allocs))
+
+		// Delete all the unwanted evals
+
+		get := &structs.EvalDeleteRequest{
+			Filter:       "JobID != \"keepme\"",
+			WriteRequest: structs.WriteRequest{AuthToken: rootToken.SecretID, Region: "global"},
+		}
+		var resp structs.EvalDeleteResponse
+		must.NoError(t, msgpackrpc.CallWithCodec(codec, structs.EvalDeleteRPCMethod, get, &resp))
+		must.Eq(t, resp.Count, evalCount)
+
+		// Assert we didn't delete the filtered evals
+		gotKeptEvals, err := store.EvalsByJob(nil, job.Namespace, "keepme")
+		must.NoError(t, err)
+		must.Len(t, 3, gotKeptEvals)
+		must.Eq(t, set.From(evalsToKeep), set.From(gotKeptEvals))
+
+		// Assert we didn't delete the evals that were not safe to delete
+		gotNotSafeEvals, err := store.EvalsByJob(nil, job.Namespace, "notsafetodelete")
+		must.NoError(t, err)
+		must.Len(t, 3, gotNotSafeEvals)
+		must.Eq(t, set.From(evalsNotSafeToDelete), set.From(gotNotSafeEvals))
+
+	})
+
 }

 func TestEvalEndpoint_List(t *testing.T) {
--- a/nomad/fsm.go
+++ b/nomad/fsm.go
@ -804,6 +804,14 @@ func (n *nomadFSM) applyDeleteEval(buf []byte, index uint64) interface{} {
 		panic(fmt.Errorf("failed to decode request: %v", err))
 	}

+	if req.Filter != "" {
+		if err := n.state.DeleteEvalsByFilter(index, req.Filter, req.NextToken, req.PerPage); err != nil {
+			n.logger.Error("DeleteEvalsByFilter failed", "error", err)
+			return err
+		}
+		return nil
+	}
+
 	if err := n.state.DeleteEval(index, req.Evals, req.Allocs, req.UserInitiated); err != nil {
 		n.logger.Error("DeleteEval failed", "error", err)
 		return err
--- a/nomad/state/state_store.go
+++ b/nomad/state/state_store.go
@ -9,6 +9,7 @@ import (
 	"strings"
 	"time"

+	"github.com/hashicorp/go-bexpr"
 	"github.com/hashicorp/go-hclog"
 	"github.com/hashicorp/go-memdb"
 	"github.com/hashicorp/go-multierror"
@ -3143,6 +3144,66 @@ func (s *StateStore) updateEvalModifyIndex(txn *txn, index uint64, evalID string
 	return nil
 }

+// DeleteEvalsByFilter is used to delete all evals that are both safe to delete
+// and match a filter.
+func (s *StateStore) DeleteEvalsByFilter(index uint64, filterExpr string, pageToken string, perPage int32) error {
+	txn := s.db.WriteTxn(index)
+	defer txn.Abort()
+
+	// These are always user-initiated, so ensure the eval broker is paused.
+	_, schedConfig, err := s.schedulerConfigTxn(txn)
+	if err != nil {
+		return err
+	}
+	if schedConfig == nil || !schedConfig.PauseEvalBroker {
+		return errors.New("eval broker is enabled; eval broker must be paused to delete evals")
+	}
+
+	filter, err := bexpr.CreateEvaluator(filterExpr)
+	if err != nil {
+		return err
+	}
+
+	iter, err := s.Evals(nil, SortDefault)
+	if err != nil {
+		return fmt.Errorf("failed to lookup evals: %v", err)
+	}
+
+	// Note: Paginator imports this package for testing so we can't just use
+	// Paginator
+	pageCount := int32(0)
+
+	for {
+		if pageCount >= perPage {
+			break
+		}
+		raw := iter.Next()
+		if raw == nil {
+			break
+		}
+		eval := raw.(*structs.Evaluation)
+		if eval.ID < pageToken {
+			continue
+		}
+
+		deleteOk, err := s.EvalIsUserDeleteSafe(nil, eval)
+		if !deleteOk || err != nil {
+			continue
+		}
+		match, err := filter.Evaluate(eval)
+		if !match || err != nil {
+			continue
+		}
+		if err := txn.Delete("evals", eval); err != nil {
+			return fmt.Errorf("eval delete failed: %v", err)
+		}
+		pageCount++
+	}
+
+	err = txn.Commit()
+	return err
+}
+
 // EvalIsUserDeleteSafe ensures an evaluation is safe to delete based on its
 // related allocation and job information. This follows similar, but different
 // rules to the eval reap checking, to ensure evaluations for running allocs or
--- a/nomad/state/state_store_test.go
+++ b/nomad/state/state_store_test.go
@ -16,6 +16,7 @@ import (
 	"github.com/hashicorp/nomad/nomad/mock"
 	"github.com/hashicorp/nomad/nomad/structs"
 	"github.com/kr/pretty"
+	"github.com/shoenig/test/must"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )
@ -4498,6 +4499,67 @@ func TestStateStore_DeleteEval_UserInitiated(t *testing.T) {
 	require.Nil(t, mockEval2Lookup)
 }

+// TestStateStore_DeleteEvalsByFilter_Pagination tests the pagination logic for
+// deleting evals by filter; the business logic is tested more fully in the eval
+// endpoint tests.
+func TestStateStore_DeleteEvalsByFilter_Pagination(t *testing.T) {
+
+	evalCount := 100
+	index := uint64(100)
+
+	store := testStateStore(t)
+
+	// Create a set of pending evaluations
+
+	schedulerConfig := &structs.SchedulerConfiguration{
+		PauseEvalBroker: true,
+		CreateIndex:     index,
+		ModifyIndex:     index,
+	}
+	must.NoError(t, store.SchedulerSetConfig(index, schedulerConfig))
+
+	evals := []*structs.Evaluation{}
+	for i := 0; i < evalCount; i++ {
+		mockEval := mock.Eval()
+		evals = append(evals, mockEval)
+	}
+	index++
+	must.NoError(t, store.UpsertEvals(
+		structs.MsgTypeTestSetup, index, evals))
+
+	// Delete one page
+	index++
+	must.NoError(t, store.DeleteEvalsByFilter(index, "JobID != \"\"", "", 10))
+
+	countRemaining := func() (string, int) {
+		lastSeen := ""
+		remaining := 0
+
+		iter, err := store.Evals(nil, SortDefault)
+		must.NoError(t, err)
+		for {
+			raw := iter.Next()
+			if raw == nil {
+				break
+			}
+			eval := raw.(*structs.Evaluation)
+			lastSeen = eval.ID
+			remaining++
+		}
+		return lastSeen, remaining
+	}
+
+	lastSeen, remaining := countRemaining()
+	must.Eq(t, 90, remaining)
+
+	// Delete starting from lastSeen, which should only delete 1
+	index++
+	must.NoError(t, store.DeleteEvalsByFilter(index, "JobID != \"\"", lastSeen, 10))
+
+	_, remaining = countRemaining()
+	must.Eq(t, 89, remaining)
+}
+
 func TestStateStore_EvalIsUserDeleteSafe(t *testing.T) {
 	ci.Parallel(t)

--- a/nomad/structs/eval.go
+++ b/nomad/structs/eval.go
@ -14,11 +14,17 @@ const (
 // not be greater than MaxEvalIDsPerDeleteRequest.
 type EvalDeleteRequest struct {
 	EvalIDs []string
+
+	// Filter specifies the go-bexpr filter expression to be used for deleting a
+	// set of evaluations that matches the filter
+	Filter string
+
 	WriteRequest
 }

 // EvalDeleteResponse is the response object when one or more evaluation are
 // deleted manually by an operator.
 type EvalDeleteResponse struct {
+	Count int // how many Evaluations were safe to delete and/or matched the filter
 	WriteMeta
 }
--- a/nomad/structs/structs.go
+++ b/nomad/structs/structs.go
@ -849,8 +849,14 @@ type EvalUpdateRequest struct {
 // Eval.Delete use the same Raft message when performing deletes so we do not
 // need more Raft message types.
 type EvalReapRequest struct {
-	Evals  []string
-	Allocs []string
+	Evals  []string // slice of Evaluation IDs
+	Allocs []string // slice of Allocation IDs
+
+	// Filter specifies the go-bexpr filter expression to be used for
+	// filtering the data prior to returning a response
+	Filter    string
+	PerPage   int32
+	NextToken string

 	// UserInitiated tracks whether this reap request is the result of an
 	// operator request. If this is true, the FSM needs to ensure the eval
--- a/website/content/docs/commands/eval/delete.mdx
+++ b/website/content/docs/commands/eval/delete.mdx
@ -36,7 +36,10 @@ When ACLs are enabled, this command requires a `management` token.
 ## Delete Options

 - `-filter`: Specifies an expression used to filter evaluations by for
-  deletion.
+  deletion. When using this flag, it is advisable to ensure the syntax is
+  correct using the eval list command first. Note that deleting evals by filter
+  is imprecise: for sets of evals larger than a single raft log batch, evals can
+  be inserted behind the cursor and therefore be missed.

 - `-yes`: Bypass the confirmation prompt if an evaluation ID was not provided.

@ -53,13 +56,6 @@ Delete all evaluations with status `pending` for the `example` job:

 ```shell-session
 $ nomad eval delete -filter='Stauts == "pending" and JobID == "example"'
-Do you want to list evals (3) before deletion? [y/N] y
-
-ID        Priority  Triggered By  Job ID   Namespace   Node ID   Status   Placement Failures
-cef92121  50        job-register  example  default     <none>    pending  false
-1c905ca0  50        job-register  example  default     <none>    pending  false
-b9e77692  50        job-register  example  default     <none>    pending  false
-
 Are you sure you want to delete 3 evals? [y/N] y

 Successfuly deleted 3 evaluations