open-nomad/command/agent/eval_endpoint_test.go

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

424 lines
12 KiB
Go
Raw Normal View History

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
2015-09-06 23:02:53 +00:00
package agent
import (
"fmt"
2015-09-06 23:02:53 +00:00
"net/http"
"net/http/httptest"
"net/url"
2015-09-06 23:02:53 +00:00
"testing"
"github.com/hashicorp/nomad/api"
"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/helper/uuid"
2015-09-06 23:02:53 +00:00
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/shoenig/test/must"
"github.com/stretchr/testify/require"
2015-09-06 23:02:53 +00:00
)
func TestHTTP_EvalList(t *testing.T) {
ci.Parallel(t)
2017-07-20 05:14:36 +00:00
httpTest(t, nil, func(s *TestAgent) {
2015-09-06 23:02:53 +00:00
// Directly manipulate the state
state := s.Agent.server.State()
eval1 := mock.Eval()
eval2 := mock.Eval()
err := state.UpsertEvals(structs.MsgTypeTestSetup, 1000, []*structs.Evaluation{eval1, eval2})
require.NoError(t, err)
2015-09-06 23:02:53 +00:00
// simple list request
req, err := http.NewRequest(http.MethodGet, "/v1/evaluations", nil)
require.NoError(t, err)
2015-09-06 23:02:53 +00:00
respW := httptest.NewRecorder()
obj, err := s.Server.EvalsRequest(respW, req)
require.NoError(t, err)
2015-09-06 23:02:53 +00:00
// check headers and response body
require.NotEqual(t, "", respW.Result().Header.Get("X-Nomad-Index"), "missing index")
require.Equal(t, "true", respW.Result().Header.Get("X-Nomad-KnownLeader"), "missing known leader")
require.NotEqual(t, "", respW.Result().Header.Get("X-Nomad-LastContact"), "missing last contact")
require.Len(t, obj.([]*structs.Evaluation), 2, "expected 2 evals")
// paginated list request
req, err = http.NewRequest(http.MethodGet, "/v1/evaluations?per_page=1", nil)
require.NoError(t, err)
respW = httptest.NewRecorder()
obj, err = s.Server.EvalsRequest(respW, req)
require.NoError(t, err)
// check response body
require.Len(t, obj.([]*structs.Evaluation), 1, "expected 1 eval")
// filtered list request
req, err = http.NewRequest(http.MethodGet,
fmt.Sprintf("/v1/evaluations?per_page=10&job=%s", eval2.JobID), nil)
require.NoError(t, err)
respW = httptest.NewRecorder()
obj, err = s.Server.EvalsRequest(respW, req)
require.NoError(t, err)
// check response body
require.Len(t, obj.([]*structs.Evaluation), 1, "expected 1 eval")
2015-09-06 23:02:53 +00:00
})
}
func TestHTTP_EvalPrefixList(t *testing.T) {
ci.Parallel(t)
2017-07-20 05:14:36 +00:00
httpTest(t, nil, func(s *TestAgent) {
// Directly manipulate the state
state := s.Agent.server.State()
eval1 := mock.Eval()
eval1.ID = "aaabbbbb-e8f7-fd38-c855-ab94ceb89706"
eval2 := mock.Eval()
eval2.ID = "aaabbbbb-e8f7-fd38-c855-ab94ceb89706"
err := state.UpsertEvals(structs.MsgTypeTestSetup, 1000, []*structs.Evaluation{eval1, eval2})
if err != nil {
t.Fatalf("err: %v", err)
}
// Make the HTTP request
req, err := http.NewRequest(http.MethodGet, "/v1/evaluations?prefix=aaab", nil)
if err != nil {
t.Fatalf("err: %v", err)
}
respW := httptest.NewRecorder()
// Make the request
obj, err := s.Server.EvalsRequest(respW, req)
if err != nil {
t.Fatalf("err: %v", err)
}
// Check for the index
if respW.Result().Header.Get("X-Nomad-Index") == "" {
t.Fatalf("missing index")
}
if respW.Result().Header.Get("X-Nomad-KnownLeader") != "true" {
t.Fatalf("missing known leader")
}
if respW.Result().Header.Get("X-Nomad-LastContact") == "" {
t.Fatalf("missing last contact")
}
// Check the eval
e := obj.([]*structs.Evaluation)
if len(e) != 1 {
t.Fatalf("bad: %#v", e)
}
// Check the identifier
if e[0].ID != eval2.ID {
t.Fatalf("expected eval ID: %v, Actual: %v", eval2.ID, e[0].ID)
}
})
}
func TestHTTP_EvalsDelete(t *testing.T) {
ci.Parallel(t)
testCases := []struct {
testFn func()
name string
}{
{
testFn: func() {
httpTest(t, nil, func(s *TestAgent) {
// Create an empty request object which doesn't contain any
// eval IDs.
deleteReq := api.EvalDeleteRequest{}
buf := encodeReq(&deleteReq)
// Generate the HTTP request.
req, err := http.NewRequest(http.MethodDelete, "/v1/evaluations", buf)
require.NoError(t, err)
respW := httptest.NewRecorder()
// Make the request and check the response.
obj, err := s.Server.EvalsRequest(respW, req)
require.Equal(t,
eval delete: move batching of deletes into RPC handler and state (#15117) During unusual outage recovery scenarios on large clusters, a backlog of millions of evaluations can appear. In these cases, the `eval delete` command can put excessive load on the cluster by listing large sets of evals to extract the IDs and then sending larges batches of IDs. Although the command's batch size was carefully tuned, we still need to be JSON deserialize, re-serialize to MessagePack, send the log entries through raft, and get the FSM applied. To improve performance of this recovery case, move the batching process into the RPC handler and the state store. The design here is a little weird, so let's look a the failed options first: * A naive solution here would be to just send the filter as the raft request and let the FSM apply delete the whole set in a single operation. Benchmarking with 1M evals on a 3 node cluster demonstrated this can block the FSM apply for several minutes, which puts the cluster at risk if there's a leadership failover (the barrier write can't be made while this apply is in-flight). * A less naive but still bad solution would be to have the RPC handler filter and paginate, and then hand a list of IDs to the existing raft log entry. Benchmarks showed this blocked the FSM apply for 20-30s at a time and took roughly an hour to complete. Instead, we're filtering and paginating in the RPC handler to find a page token, and then passing both the filter and page token in the raft log. The FSM apply recreates the paginator using the filter and page token to get roughly the same page of evaluations, which it then deletes. The pagination process is fairly cheap (only abut 5% of the total FSM apply time), so counter-intuitively this rework ends up being much faster. A benchmark of 1M evaluations showed this blocked the FSM apply for 20-30ms at a time (typical for normal operations) and completes in less than 4 minutes. 
Note that, as with the existing design, this delete is not consistent: a new evaluation inserted "behind" the cursor of the pagination will fail to be deleted.
2022-11-14 19:08:13 +00:00
CodedError(http.StatusBadRequest, "evals must be deleted by either ID or filter"), err)
require.Nil(t, obj)
})
},
name: "too few eval IDs",
},
{
testFn: func() {
httpTest(t, nil, func(s *TestAgent) {
deleteReq := api.EvalDeleteRequest{EvalIDs: make([]string, 8000)}
// Generate a UUID and add it 8000 times to the eval ID
// request array.
evalID := uuid.Generate()
for i := 0; i < 8000; i++ {
deleteReq.EvalIDs[i] = evalID
}
buf := encodeReq(&deleteReq)
// Generate the HTTP request.
req, err := http.NewRequest(http.MethodDelete, "/v1/evaluations", buf)
require.NoError(t, err)
respW := httptest.NewRecorder()
// Make the request and check the response.
obj, err := s.Server.EvalsRequest(respW, req)
require.Equal(t,
CodedError(http.StatusBadRequest,
eval delete: move batching of deletes into RPC handler and state (#15117) During unusual outage recovery scenarios on large clusters, a backlog of millions of evaluations can appear. In these cases, the `eval delete` command can put excessive load on the cluster by listing large sets of evals to extract the IDs and then sending larges batches of IDs. Although the command's batch size was carefully tuned, we still need to be JSON deserialize, re-serialize to MessagePack, send the log entries through raft, and get the FSM applied. To improve performance of this recovery case, move the batching process into the RPC handler and the state store. The design here is a little weird, so let's look a the failed options first: * A naive solution here would be to just send the filter as the raft request and let the FSM apply delete the whole set in a single operation. Benchmarking with 1M evals on a 3 node cluster demonstrated this can block the FSM apply for several minutes, which puts the cluster at risk if there's a leadership failover (the barrier write can't be made while this apply is in-flight). * A less naive but still bad solution would be to have the RPC handler filter and paginate, and then hand a list of IDs to the existing raft log entry. Benchmarks showed this blocked the FSM apply for 20-30s at a time and took roughly an hour to complete. Instead, we're filtering and paginating in the RPC handler to find a page token, and then passing both the filter and page token in the raft log. The FSM apply recreates the paginator using the filter and page token to get roughly the same page of evaluations, which it then deletes. The pagination process is fairly cheap (only abut 5% of the total FSM apply time), so counter-intuitively this rework ends up being much faster. A benchmark of 1M evaluations showed this blocked the FSM apply for 20-30ms at a time (typical for normal operations) and completes in less than 4 minutes. 
Note that, as with the existing design, this delete is not consistent: a new evaluation inserted "behind" the cursor of the pagination will fail to be deleted.
2022-11-14 19:08:13 +00:00
"request includes 8000 evaluation IDs, must be 7281 or fewer"), err)
require.Nil(t, obj)
})
},
name: "too many eval IDs",
},
{
testFn: func() {
httpTest(t, func(c *Config) {
c.NomadConfig.DefaultSchedulerConfig.PauseEvalBroker = true
}, func(s *TestAgent) {
// Generate a request with an eval ID that doesn't exist
// within state.
deleteReq := api.EvalDeleteRequest{EvalIDs: []string{uuid.Generate()}}
buf := encodeReq(&deleteReq)
// Generate the HTTP request.
req, err := http.NewRequest(http.MethodDelete, "/v1/evaluations", buf)
require.NoError(t, err)
respW := httptest.NewRecorder()
// Make the request and check the response.
obj, err := s.Server.EvalsRequest(respW, req)
require.Contains(t, err.Error(), "eval not found")
require.Nil(t, obj)
})
},
name: "eval doesn't exist",
},
{
testFn: func() {
httpTest(t, func(c *Config) {
c.NomadConfig.DefaultSchedulerConfig.PauseEvalBroker = true
}, func(s *TestAgent) {
// Upsert an eval into state.
mockEval := mock.Eval()
err := s.Agent.server.State().UpsertEvals(
structs.MsgTypeTestSetup, 10, []*structs.Evaluation{mockEval})
require.NoError(t, err)
// Generate a request with the ID of the eval previously upserted.
deleteReq := api.EvalDeleteRequest{EvalIDs: []string{mockEval.ID}}
buf := encodeReq(&deleteReq)
// Generate the HTTP request.
req, err := http.NewRequest(http.MethodDelete, "/v1/evaluations", buf)
require.NoError(t, err)
respW := httptest.NewRecorder()
// Make the request and check the response.
obj, err := s.Server.EvalsRequest(respW, req)
eval delete: move batching of deletes into RPC handler and state (#15117) During unusual outage recovery scenarios on large clusters, a backlog of millions of evaluations can appear. In these cases, the `eval delete` command can put excessive load on the cluster by listing large sets of evals to extract the IDs and then sending larges batches of IDs. Although the command's batch size was carefully tuned, we still need to be JSON deserialize, re-serialize to MessagePack, send the log entries through raft, and get the FSM applied. To improve performance of this recovery case, move the batching process into the RPC handler and the state store. The design here is a little weird, so let's look a the failed options first: * A naive solution here would be to just send the filter as the raft request and let the FSM apply delete the whole set in a single operation. Benchmarking with 1M evals on a 3 node cluster demonstrated this can block the FSM apply for several minutes, which puts the cluster at risk if there's a leadership failover (the barrier write can't be made while this apply is in-flight). * A less naive but still bad solution would be to have the RPC handler filter and paginate, and then hand a list of IDs to the existing raft log entry. Benchmarks showed this blocked the FSM apply for 20-30s at a time and took roughly an hour to complete. Instead, we're filtering and paginating in the RPC handler to find a page token, and then passing both the filter and page token in the raft log. The FSM apply recreates the paginator using the filter and page token to get roughly the same page of evaluations, which it then deletes. The pagination process is fairly cheap (only abut 5% of the total FSM apply time), so counter-intuitively this rework ends up being much faster. A benchmark of 1M evaluations showed this blocked the FSM apply for 20-30ms at a time (typical for normal operations) and completes in less than 4 minutes. 
Note that, as with the existing design, this delete is not consistent: a new evaluation inserted "behind" the cursor of the pagination will fail to be deleted.
2022-11-14 19:08:13 +00:00
require.NoError(t, err)
require.NotNil(t, obj)
deleteResp := obj.(structs.EvalDeleteResponse)
require.Equal(t, deleteResp.Count, 1)
// Ensure the eval is not found.
readEval, err := s.Agent.server.State().EvalByID(nil, mockEval.ID)
require.NoError(t, err)
require.Nil(t, readEval)
})
},
name: "successfully delete eval",
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
tc.testFn()
})
}
}
2015-09-06 23:18:25 +00:00
func TestHTTP_EvalAllocations(t *testing.T) {
ci.Parallel(t)
2017-07-20 05:14:36 +00:00
httpTest(t, nil, func(s *TestAgent) {
2015-09-06 23:18:25 +00:00
// Directly manipulate the state
state := s.Agent.server.State()
alloc1 := mock.Alloc()
alloc2 := mock.Alloc()
alloc2.EvalID = alloc1.EvalID
2016-07-25 21:11:32 +00:00
state.UpsertJobSummary(998, mock.JobSummary(alloc1.JobID))
state.UpsertJobSummary(999, mock.JobSummary(alloc2.JobID))
err := state.UpsertAllocs(structs.MsgTypeTestSetup, 1000, []*structs.Allocation{alloc1, alloc2})
2015-09-06 23:18:25 +00:00
if err != nil {
t.Fatalf("err: %v", err)
}
// Make the HTTP request
req, err := http.NewRequest(http.MethodGet,
2015-09-06 23:18:25 +00:00
"/v1/evaluation/"+alloc1.EvalID+"/allocations", nil)
if err != nil {
t.Fatalf("err: %v", err)
}
respW := httptest.NewRecorder()
// Make the request
obj, err := s.Server.EvalSpecificRequest(respW, req)
if err != nil {
t.Fatalf("err: %v", err)
}
// Check for the index
if respW.Result().Header.Get("X-Nomad-Index") == "" {
2015-09-06 23:18:25 +00:00
t.Fatalf("missing index")
}
if respW.Result().Header.Get("X-Nomad-KnownLeader") != "true" {
2015-09-06 23:18:25 +00:00
t.Fatalf("missing known leader")
}
if respW.Result().Header.Get("X-Nomad-LastContact") == "" {
2015-09-06 23:18:25 +00:00
t.Fatalf("missing last contact")
}
2018-03-11 18:35:30 +00:00
// Check the output
2015-09-06 23:18:25 +00:00
allocs := obj.([]*structs.AllocListStub)
if len(allocs) != 2 {
t.Fatalf("bad: %#v", allocs)
}
})
}
2015-09-06 23:02:53 +00:00
func TestHTTP_EvalQuery(t *testing.T) {
ci.Parallel(t)
2017-07-20 05:14:36 +00:00
httpTest(t, nil, func(s *TestAgent) {
2015-09-06 23:02:53 +00:00
// Directly manipulate the state
state := s.Agent.server.State()
eval := mock.Eval()
err := state.UpsertEvals(structs.MsgTypeTestSetup, 1000, []*structs.Evaluation{eval})
2015-09-06 23:02:53 +00:00
if err != nil {
t.Fatalf("err: %v", err)
}
// Make the HTTP request
req, err := http.NewRequest(http.MethodGet, "/v1/evaluation/"+eval.ID, nil)
2015-09-06 23:02:53 +00:00
if err != nil {
t.Fatalf("err: %v", err)
}
respW := httptest.NewRecorder()
// Make the request
obj, err := s.Server.EvalSpecificRequest(respW, req)
if err != nil {
t.Fatalf("err: %v", err)
}
// Check for the index
if respW.Result().Header.Get("X-Nomad-Index") == "" {
2015-09-06 23:02:53 +00:00
t.Fatalf("missing index")
}
if respW.Result().Header.Get("X-Nomad-KnownLeader") != "true" {
2015-09-06 23:02:53 +00:00
t.Fatalf("missing known leader")
}
if respW.Result().Header.Get("X-Nomad-LastContact") == "" {
2015-09-06 23:02:53 +00:00
t.Fatalf("missing last contact")
}
// Check the job
e := obj.(*structs.Evaluation)
if e.ID != eval.ID {
t.Fatalf("bad: %#v", e)
}
})
}
// TestHTTP_EvalQueryWithRelated links two evaluations through their
// NextEval/PreviousEval fields and confirms that fetching the first with
// related=true returns it with the second listed, as a stub, in RelatedEvals.
func TestHTTP_EvalQueryWithRelated(t *testing.T) {
	ci.Parallel(t)

	httpTest(t, nil, func(s *TestAgent) {
		// Build a two-eval chain and write it straight into state.
		origin := mock.Eval()
		follow := mock.Eval()
		origin.NextEval = follow.ID
		follow.PreviousEval = origin.ID

		chain := []*structs.Evaluation{origin, follow}
		require.NoError(t,
			s.Agent.server.State().UpsertEvals(structs.MsgTypeTestSetup, 1000, chain))

		// Ask for the first eval and its related evals.
		getReq, err := http.NewRequest(http.MethodGet, "/v1/evaluation/"+origin.ID+"?related=true", nil)
		require.NoError(t, err)

		rec := httptest.NewRecorder()
		out, err := s.Server.EvalSpecificRequest(rec, getReq)
		require.NoError(t, err)

		// Query metadata headers must all be populated.
		hdr := rec.Result().Header
		require.NotEmpty(t, hdr.Get("X-Nomad-Index"))
		require.NotEmpty(t, hdr.Get("X-Nomad-KnownLeader"))
		require.NotEmpty(t, hdr.Get("X-Nomad-LastContact"))

		// The requested eval comes back...
		got := out.(*structs.Evaluation)
		require.Equal(t, origin.ID, got.ID)

		// ...carrying only the linked eval, in stub form.
		require.Equal(t, []*structs.EvaluationStub{follow.Stub()}, got.RelatedEvals)
	})
}
// TestHTTP_EvalCount stores two mock evaluations and checks the count handler
// twice: unfiltered (expecting 2) and with a filter expression on the second
// eval's JobID (expecting 1).
func TestHTTP_EvalCount(t *testing.T) {
	ci.Parallel(t)

	httpTest(t, nil, func(s *TestAgent) {
		// Directly manipulate the state
		state := s.Agent.server.State()
		eval1 := mock.Eval()
		eval2 := mock.Eval()
		err := state.UpsertEvals(structs.MsgTypeTestSetup, 1000, []*structs.Evaluation{eval1, eval2})
		must.NoError(t, err)

		// simple count request
		req, err := http.NewRequest(http.MethodGet, "/v1/evaluations/count", nil)
		must.NoError(t, err)
		respW := httptest.NewRecorder()
		obj, err := s.Server.EvalsCountRequest(respW, req)
		must.NoError(t, err)

		// check headers and response body
		must.NotEq(t, "", respW.Result().Header.Get("X-Nomad-Index"),
			must.Sprint("missing index"))
		must.Eq(t, "true", respW.Result().Header.Get("X-Nomad-KnownLeader"),
			must.Sprint("missing known leader"))
		must.NotEq(t, "", respW.Result().Header.Get("X-Nomad-LastContact"),
			must.Sprint("missing last contact"))

		// must.Eq takes the expected value first; keeping that order makes
		// failure output label expected and actual correctly.
		resp := obj.(*structs.EvalCountResponse)
		must.Eq(t, 2, resp.Count)

		// filtered count request; url.Values handles escaping of the
		// quoted filter expression
		v := url.Values{}
		v.Add("filter", fmt.Sprintf("JobID==\"%s\"", eval2.JobID))
		req, err = http.NewRequest(http.MethodGet, "/v1/evaluations/count?"+v.Encode(), nil)
		must.NoError(t, err)
		respW = httptest.NewRecorder()
		obj, err = s.Server.EvalsCountRequest(respW, req)
		must.NoError(t, err)
		resp = obj.(*structs.EvalCountResponse)
		must.Eq(t, 1, resp.Count)
	})
}