open-nomad/command/agent/eval_endpoint.go
Tim Gross 37134a4a37
eval delete: move batching of deletes into RPC handler and state (#15117)
During unusual outage recovery scenarios on large clusters, a backlog of
millions of evaluations can appear. In these cases, the `eval delete` command can
put excessive load on the cluster by listing large sets of evals to extract the
IDs and then sending larges batches of IDs. Although the command's batch size
was carefully tuned, we still need to be JSON deserialize, re-serialize to
MessagePack, send the log entries through raft, and get the FSM applied.

To improve performance of this recovery case, move the batching process into the
RPC handler and the state store. The design here is a little weird, so let's
look a the failed options first:

* A naive solution here would be to just send the filter as the raft request and
  let the FSM apply delete the whole set in a single operation. Benchmarking with
  1M evals on a 3 node cluster demonstrated this can block the FSM apply for
  several minutes, which puts the cluster at risk if there's a leadership
  failover (the barrier write can't be made while this apply is in-flight).

* A less naive but still bad solution would be to have the RPC handler filter
  and paginate, and then hand a list of IDs to the existing raft log
  entry. Benchmarks showed this blocked the FSM apply for 20-30s at a time and
  took roughly an hour to complete.

Instead, we're filtering and paginating in the RPC handler to find a page token,
and then passing both the filter and page token in the raft log. The FSM apply
recreates the paginator using the filter and page token to get roughly the same
page of evaluations, which it then deletes. The pagination process is fairly
cheap (only abut 5% of the total FSM apply time), so counter-intuitively this
rework ends up being much faster. A benchmark of 1M evaluations showed this
blocked the FSM apply for 20-30ms at a time (typical for normal operations) and
completes in less than 4 minutes.

Note that, as with the existing design, this delete is not consistent: a new
evaluation inserted "behind" the cursor of the pagination will fail to be
deleted.
2022-11-14 14:08:13 -05:00

167 lines
4.5 KiB
Go

package agent
import (
"fmt"
"net/http"
"strings"
"github.com/hashicorp/nomad/nomad/structs"
)
// EvalsRequest is the entry point for /v1/evaluations and is responsible for
// handling both the listing of evaluations, and the bulk deletion of
// evaluations. The latter is a dangerous operation and should use the
// eval delete command to perform this.
func (s *HTTPServer) EvalsRequest(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
switch req.Method {
case http.MethodGet:
return s.evalsListRequest(resp, req)
case http.MethodDelete:
return s.evalsDeleteRequest(resp, req)
default:
return nil, CodedError(http.StatusMethodNotAllowed, ErrInvalidMethod)
}
}
func (s *HTTPServer) evalsListRequest(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
args := structs.EvalListRequest{}
if s.parse(resp, req, &args.Region, &args.QueryOptions) {
return nil, nil
}
query := req.URL.Query()
args.FilterEvalStatus = query.Get("status")
args.FilterJobID = query.Get("job")
var out structs.EvalListResponse
if err := s.agent.RPC("Eval.List", &args, &out); err != nil {
return nil, err
}
setMeta(resp, &out.QueryMeta)
if out.Evaluations == nil {
out.Evaluations = make([]*structs.Evaluation, 0)
}
return out.Evaluations, nil
}
func (s *HTTPServer) evalsDeleteRequest(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
var args structs.EvalDeleteRequest
if err := decodeBody(req, &args); err != nil {
return nil, CodedError(http.StatusBadRequest, err.Error())
}
numIDs := len(args.EvalIDs)
if args.Filter != "" && numIDs > 0 {
return nil, CodedError(http.StatusBadRequest,
"evals cannot be deleted by both ID and filter")
}
if args.Filter == "" && numIDs == 0 {
return nil, CodedError(http.StatusBadRequest,
"evals must be deleted by either ID or filter")
}
// If an explicit list of evaluation IDs is sent, ensure its within bounds
if numIDs > structs.MaxUUIDsPerWriteRequest {
return nil, CodedError(http.StatusBadRequest, fmt.Sprintf(
"request includes %v evaluation IDs, must be %v or fewer",
numIDs, structs.MaxUUIDsPerWriteRequest))
}
// Pass the write request to populate all meta fields.
s.parseWriteRequest(req, &args.WriteRequest)
var reply structs.EvalDeleteResponse
if err := s.agent.RPC(structs.EvalDeleteRPCMethod, &args, &reply); err != nil {
return nil, err
}
setIndex(resp, reply.Index)
return reply, nil
}
func (s *HTTPServer) EvalSpecificRequest(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
path := strings.TrimPrefix(req.URL.Path, "/v1/evaluation/")
switch {
case strings.HasSuffix(path, "/allocations"):
evalID := strings.TrimSuffix(path, "/allocations")
return s.evalAllocations(resp, req, evalID)
default:
return s.evalQuery(resp, req, path)
}
}
func (s *HTTPServer) evalAllocations(resp http.ResponseWriter, req *http.Request, evalID string) (interface{}, error) {
if req.Method != "GET" {
return nil, CodedError(405, ErrInvalidMethod)
}
args := structs.EvalSpecificRequest{
EvalID: evalID,
}
if s.parse(resp, req, &args.Region, &args.QueryOptions) {
return nil, nil
}
var out structs.EvalAllocationsResponse
if err := s.agent.RPC("Eval.Allocations", &args, &out); err != nil {
return nil, err
}
setMeta(resp, &out.QueryMeta)
if out.Allocations == nil {
out.Allocations = make([]*structs.AllocListStub, 0)
}
return out.Allocations, nil
}
func (s *HTTPServer) evalQuery(resp http.ResponseWriter, req *http.Request, evalID string) (interface{}, error) {
if req.Method != "GET" {
return nil, CodedError(405, ErrInvalidMethod)
}
args := structs.EvalSpecificRequest{
EvalID: evalID,
}
if s.parse(resp, req, &args.Region, &args.QueryOptions) {
return nil, nil
}
query := req.URL.Query()
args.IncludeRelated = query.Get("related") == "true"
var out structs.SingleEvalResponse
if err := s.agent.RPC("Eval.GetEval", &args, &out); err != nil {
return nil, err
}
setMeta(resp, &out.QueryMeta)
if out.Eval == nil {
return nil, CodedError(404, "eval not found")
}
return out.Eval, nil
}
func (s *HTTPServer) EvalsCountRequest(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
if req.Method != http.MethodGet {
return nil, CodedError(http.StatusMethodNotAllowed, ErrInvalidMethod)
}
args := structs.EvalCountRequest{}
if s.parse(resp, req, &args.Region, &args.QueryOptions) {
return nil, nil
}
var out structs.EvalCountResponse
if err := s.agent.RPC("Eval.Count", &args, &out); err != nil {
return nil, err
}
setMeta(resp, &out.QueryMeta)
return &out, nil
}