open-nomad/command/agent/metrics_endpoint_test.go
Mahmood Ali 7b8cfee162 tests: deflake TestHTTP_FreshClientAllocMetrics
The test asserts that alloc counts get reported accurately in metrics by
inspecting the metrics endpoint directly.  Sadly, the metrics as
collected by `armon/go-metrics` seem to be stateful and may contain info
from other tests.

This means that the test can fail depending on the order of returned
metrics.

Inspecting the metrics output of one failing run, you can see
duplicate gauge entries with different node_ids:

```
    {
      "Name": "service-name.default-0a3ba4b6-2109-485e-be74-6864228aed3d.client.allocations.terminal",
      "Value": 10,
      "Labels": {
        "datacenter": "dc1",
        "node_class": "none",
        "node_id": "67402bf4-00f3-bd8d-9fa8-f4d1924a892a"
      }
    },
    {
      "Name": "service-name.default-0a3ba4b6-2109-485e-be74-6864228aed3d.client.allocations.terminal",
      "Value": 0,
      "Labels": {
        "datacenter": "dc1",
        "node_class": "none",
        "node_id": "a2945b48-7e66-68e2-c922-49b20dd4e20c"
      }
    },
```
2019-11-22 18:41:21 -05:00

137 lines
4 KiB
Go

package agent
import (
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"
"github.com/armon/go-metrics"
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/testutil"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestHTTP_MetricsWithIllegalMethod verifies that the metrics endpoint
// handler returns an error when it is invoked with an HTTP DELETE request.
func TestHTTP_MetricsWithIllegalMethod(t *testing.T) {
	check := assert.New(t)
	t.Parallel()

	// exercise issues the DELETE request against the agent's metrics handler
	// and expects the handler to reject it.
	exercise := func(agent *TestAgent) {
		request, buildErr := http.NewRequest("DELETE", "/v1/metrics", nil)
		check.Nil(buildErr)

		recorder := httptest.NewRecorder()
		_, callErr := agent.Server.MetricsRequest(recorder, request)
		check.NotNil(callErr, "HTTP DELETE should not be accepted for this endpoint")
	}

	httpTest(t, nil, exercise)
}
// TestHTTP_Metrics verifies that a GET request to the metrics endpoint
// eventually yields a summary containing at least one gauge.
func TestHTTP_Metrics(t *testing.T) {
	check := assert.New(t)
	t.Parallel()

	httpTest(t, nil, func(agent *TestAgent) {
		// Hit a different endpoint first so that Nomad has had a chance to
		// write metrics; this avoids racing the metrics request below.
		warmupReq, err := http.NewRequest("GET", "/v1/agent/self", nil)
		check.Nil(err)
		agent.Server.AgentSelfRequest(httptest.NewRecorder(), warmupReq)

		// The metrics endpoint should now be initialized and written to.
		metricsReq, err := http.NewRequest("GET", "/v1/metrics", nil)
		check.Nil(err)
		recorder := httptest.NewRecorder()

		// hasGauges reports whether the metrics summary holds any gauge yet.
		hasGauges := func() (bool, error) {
			out, reqErr := agent.Server.MetricsRequest(recorder, metricsReq)
			if reqErr != nil {
				return false, reqErr
			}
			recorder.Flush()

			summary := out.(metrics.MetricsSummary)
			return len(summary.Gauges) > 0, nil
		}

		// onTimeout aborts the test once the retries are exhausted.
		onTimeout := func(waitErr error) {
			t.Fatalf("should have metrics: %v", waitErr)
		}

		testutil.WaitForResult(hasGauges, onTimeout)
	})
}
// When emitting metrics, the client should use the local copy of the allocs with
// updated task states (not the copy submitted by the server).
//
// The test registers a batch job with numTasks allocations, waits until the
// job is reported as dead, and then polls the metrics endpoint until this
// agent's client allocation gauges read 0 pending, 0 running and numTasks
// terminal.
func TestHTTP_FreshClientAllocMetrics(t *testing.T) {
	t.Parallel()
	require := require.New(t)
	numTasks := 10

	httpTest(t, func(c *Config) {
		c.Telemetry.PublishAllocationMetrics = true
		c.Telemetry.PublishNodeMetrics = true
		c.Telemetry.BackwardsCompatibleMetrics = false
		c.Telemetry.DisableTaggedMetrics = false
	}, func(s *TestAgent) {
		// Create the job, wait for it to finish
		job := mock.BatchJob()
		job.TaskGroups[0].Count = numTasks
		testutil.RegisterJob(t, s.RPC, job)

		testutil.WaitForResult(func() (bool, error) {
			time.Sleep(200 * time.Millisecond)

			args := &structs.JobSpecificRequest{}
			args.JobID = job.ID
			args.QueryOptions.Region = "global"

			var resp structs.SingleJobResponse
			if err := s.RPC("Job.GetJob", args, &resp); err != nil {
				return false, err
			}

			// Guard the dereference: a successful RPC with no job in the
			// response should trigger a retry rather than a nil-pointer panic.
			return resp.Job != nil && resp.Job.Status == "dead", nil
		}, func(err error) {
			require.Fail("timed-out waiting for job to complete", "last error: %v", err)
		})

		nodeID := s.client.NodeID()

		// wait for metrics to converge; -1 marks a gauge that was never seen
		var pending, running, terminal float32 = -1.0, -1.0, -1.0
		testutil.WaitForResultRetries(100, func() (bool, error) {
			time.Sleep(100 * time.Millisecond)

			req, err := http.NewRequest("GET", "/v1/metrics", nil)
			require.NoError(err)
			respW := httptest.NewRecorder()

			obj, err := s.Server.MetricsRequest(respW, req)
			if err != nil {
				return false, err
			}

			// Fail with a readable message instead of panicking if the
			// endpoint returns something other than a metrics summary.
			summary, ok := obj.(metrics.MetricsSummary)
			require.True(ok, "unexpected metrics response type %T", obj)

			for _, g := range summary.Gauges {
				// ignore client metrics belonging to other test nodes
				// from other tests that contaminate go-metrics reporting
				if g.DisplayLabels["node_id"] != nodeID {
					continue
				}

				switch {
				case strings.HasSuffix(g.Name, "client.allocations.pending"):
					pending = g.Value
				case strings.HasSuffix(g.Name, "client.allocations.running"):
					running = g.Value
				case strings.HasSuffix(g.Name, "client.allocations.terminal"):
					terminal = g.Value
				}
			}

			// client alloc metrics should reflect that there is numTasks terminal allocs and no other allocs
			return pending == float32(0) && running == float32(0) &&
				terminal == float32(numTasks), nil
		}, func(err error) {
			require.Fail("timed out waiting for metrics to converge",
				"expected: (pending: 0, running: 0, terminal: %v), got: (pending: %v, running: %v, terminal: %v), last error: %v",
				numTasks, pending, running, terminal, err)
		})
	})
}