package api
import (
"sort"
"time"
)
// Evaluations is used to query the evaluation endpoints.
type Evaluations struct {
client *Client
}
// Evaluations returns a new handle on the evaluations.
func (c *Client) Evaluations() *Evaluations {
return &Evaluations{client: c}
}
// List is used to dump all of the evaluations.
func (e *Evaluations) List(q *QueryOptions) ([]*Evaluation, *QueryMeta, error) {
var resp []*Evaluation
qm, err := e.client.query("/v1/evaluations", &resp, q)
if err != nil {
return nil, nil, err
}
sort.Sort(EvalIndexSort(resp))
return resp, qm, nil
}
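
// A minimal usage sketch of List, illustrative only and not part of the API
// surface. It assumes a reachable Nomad agent at the default address and
// elides most error handling.
func exampleEvaluationsList() error {
	client, err := NewClient(DefaultConfig())
	if err != nil {
		return err
	}
	evals, _, err := client.Evaluations().List(nil)
	if err != nil {
		return err
	}
	// Results arrive sorted by CreateIndex, newest first (see EvalIndexSort).
	_ = evals
	return nil
}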

// PrefixList is used to list evaluations whose IDs match the given prefix.
func (e *Evaluations) PrefixList(prefix string) ([]*Evaluation, *QueryMeta, error) {
return e.List(&QueryOptions{Prefix: prefix})
}
// Count is used to get a count of evaluations.
func (e *Evaluations) Count(q *QueryOptions) (*EvalCountResponse, *QueryMeta, error) {
var resp *EvalCountResponse
qm, err := e.client.query("/v1/evaluations/count", &resp, q)
if err != nil {
		return nil, nil, err
}
return resp, qm, nil
}
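
// A hedged sketch of Count with a server-side filter expression; the
// expression below is an assumed example, so consult the Nomad API filtering
// documentation for the syntax supported by your server version.
func exampleEvaluationsCount(client *Client) (int, error) {
	resp, _, err := client.Evaluations().Count(&QueryOptions{
		Filter: `Status == "pending"`,
	})
	if err != nil {
		return 0, err
	}
	return resp.Count, nil
}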
// Info is used to query a single evaluation by its ID.
func (e *Evaluations) Info(evalID string, q *QueryOptions) (*Evaluation, *QueryMeta, error) {
var resp Evaluation
qm, err := e.client.query("/v1/evaluation/"+evalID, &resp, q)
if err != nil {
return nil, nil, err
}
return &resp, qm, nil
}
// Delete is used to batch delete evaluations using their IDs.
func (e *Evaluations) Delete(evalIDs []string, w *WriteOptions) (*WriteMeta, error) {
req := EvalDeleteRequest{
EvalIDs: evalIDs,
}
wm, err := e.client.delete("/v1/evaluations", &req, nil, w)
if err != nil {
return nil, err
}
return wm, nil
}
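
// An illustrative sketch of Delete with an explicit batch of IDs, typically
// gathered from a prior List call. Note the operational precondition (not
// enforced by this client method): the Nomad server only accepts evaluation
// deletion while the eval broker is paused.
func exampleEvaluationsDelete(client *Client, evalIDs []string) error {
	_, err := client.Evaluations().Delete(evalIDs, nil)
	return err
}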

// The filter-based delete below moves the batching of deletes into the RPC
// handler and the state store (#15117). During unusual outage recovery
// scenarios on large clusters, a backlog of millions of evaluations can
// appear. In these cases, deleting by explicit ID lists puts excessive load
// on the cluster: large sets of evals must be listed to extract the IDs, and
// the resulting batches of IDs must be JSON-deserialized, re-serialized to
// MessagePack, sent through raft as log entries, and applied by the FSM.
//
// Two simpler designs were benchmarked and rejected:
//
//   - Sending the filter as the raft request and letting the FSM apply delete
//     the whole set in a single operation. With 1M evals on a 3-node cluster,
//     this blocked the FSM apply for several minutes, which puts the cluster
//     at risk if there is a leadership failover (the barrier write cannot be
//     made while the apply is in flight).
//   - Having the RPC handler filter and paginate, then hand a list of IDs to
//     the existing raft log entry. This blocked the FSM apply for 20-30s at a
//     time and took roughly an hour to complete.
//
// Instead, the RPC handler filters and paginates to find a page token, and
// both the filter and the page token are passed in the raft log. The FSM
// apply recreates the paginator from them to fetch roughly the same page of
// evaluations, which it then deletes. Pagination is cheap (about 5% of the
// total FSM apply time), so counter-intuitively this rework is much faster: a
// benchmark with 1M evaluations blocked the FSM apply for only 20-30ms at a
// time (typical for normal operations) and completed in under 4 minutes.
//
// Note that, as with the ID-based design, this delete is not consistent: a
// new evaluation inserted "behind" the cursor of the pagination will fail to
// be deleted.

// DeleteOpts is used to batch delete evaluations using a filter.
func (e *Evaluations) DeleteOpts(req *EvalDeleteRequest, w *WriteOptions) (*EvalDeleteResponse, *WriteMeta, error) {
resp := &EvalDeleteResponse{}
	wm, err := e.client.delete("/v1/evaluations", req, resp, w)
if err != nil {
return nil, nil, err
}
return resp, wm, nil
}
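
// A sketch of a filter-based delete via DeleteOpts; the filter expression is
// an assumed example. Per the design note above, the server pages through the
// matching evaluations itself, so a single call can remove a very large set.
func exampleEvaluationsDeleteOpts(client *Client) (int, error) {
	resp, _, err := client.Evaluations().DeleteOpts(&EvalDeleteRequest{
		Filter: `Status == "failed"`,
	}, nil)
	if err != nil {
		return 0, err
	}
	return resp.Count, nil
}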
// Allocations is used to retrieve a set of allocations given
// an evaluation ID.
func (e *Evaluations) Allocations(evalID string, q *QueryOptions) ([]*AllocationListStub, *QueryMeta, error) {
var resp []*AllocationListStub
qm, err := e.client.query("/v1/evaluation/"+evalID+"/allocations", &resp, q)
if err != nil {
return nil, nil, err
}
sort.Sort(AllocIndexSort(resp))
return resp, qm, nil
}
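
// An illustrative sketch of walking from an evaluation to the allocations it
// created; evalID would typically come from Info or List.
func exampleEvaluationAllocations(client *Client, evalID string) ([]*AllocationListStub, error) {
	allocs, _, err := client.Evaluations().Allocations(evalID, nil)
	if err != nil {
		return nil, err
	}
	// Allocations are returned sorted by CreateIndex, newest first.
	return allocs, nil
}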
const (
EvalStatusBlocked = "blocked"
EvalStatusPending = "pending"
EvalStatusComplete = "complete"
EvalStatusFailed = "failed"
EvalStatusCancelled = "canceled"
)
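
// A small illustrative helper (not part of the API) showing the status
// constants in use: count how many evaluations in a fetched list are still
// blocked.
func exampleCountBlockedEvals(evals []*Evaluation) int {
	blocked := 0
	for _, eval := range evals {
		if eval.Status == EvalStatusBlocked {
			blocked++
		}
	}
	return blocked
}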
// Evaluation is used to serialize an evaluation.
type Evaluation struct {
ID string
Priority int
Type string
TriggeredBy string
Namespace string
JobID string
JobModifyIndex uint64
NodeID string
NodeModifyIndex uint64
DeploymentID string
Status string
StatusDescription string
Wait time.Duration
WaitUntil time.Time
NextEval string
PreviousEval string
BlockedEval string
RelatedEvals []*EvaluationStub
FailedTGAllocs map[string]*AllocationMetric
ClassEligibility map[string]bool
EscapedComputedClass bool
QuotaLimitReached string
AnnotatePlan bool
QueuedAllocations map[string]int
SnapshotIndex uint64
CreateIndex uint64
ModifyIndex uint64
CreateTime int64
ModifyTime int64
}
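
// An illustrative sketch of reading the placement-failure fields above from
// an evaluation fetched via Info or List. FailedTGAllocs is keyed by task
// group name; a non-empty map means the scheduler could not place every
// allocation for this evaluation.
func exampleFailedTaskGroups(eval *Evaluation) []string {
	var groups []string
	for tg := range eval.FailedTGAllocs {
		groups = append(groups, tg)
	}
	return groups
}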
// EvaluationStub is used to serialize parts of an evaluation returned in the
// RelatedEvals field of an Evaluation.
type EvaluationStub struct {
ID string
Priority int
Type string
TriggeredBy string
Namespace string
JobID string
NodeID string
DeploymentID string
Status string
StatusDescription string
WaitUntil time.Time
NextEval string
PreviousEval string
BlockedEval string
CreateIndex uint64
ModifyIndex uint64
CreateTime int64
ModifyTime int64
}

// EvalDeleteRequest is the request object for batch evaluation deletion,
// identifying the evaluations either by explicit IDs or by a filter
// expression.
type EvalDeleteRequest struct {
EvalIDs []string

	// Filter, when set, selects the evaluations to delete in place of
	// EvalIDs; the server pages through the matching set itself (see the
	// note above DeleteOpts).
Filter string
WriteRequest
}

// EvalDeleteResponse is the response object for batch evaluation deletion;
// Count reports how many evaluations were deleted.
type EvalDeleteResponse struct {
Count int
}

// EvalCountResponse is the response object for an evaluation count query.
type EvalCountResponse struct {
Count int
QueryMeta
}
// EvalIndexSort is a wrapper to sort evaluations by CreateIndex.
// We reverse the test so that we get the highest index first.
type EvalIndexSort []*Evaluation
func (e EvalIndexSort) Len() int {
return len(e)
}
func (e EvalIndexSort) Less(i, j int) bool {
return e[i].CreateIndex > e[j].CreateIndex
}
func (e EvalIndexSort) Swap(i, j int) {
e[i], e[j] = e[j], e[i]
}
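
// A small sketch of applying EvalIndexSort directly, for callers that merge
// evaluations from several queries and want the same newest-first ordering
// that List and Allocations produce.
func exampleSortEvals(evals []*Evaluation) {
	sort.Sort(EvalIndexSort(evals))
}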