open-nomad/command/agent/metrics_endpoint_test.go

package agent

import (
	"net/http"
	"net/http/httptest"
	"strings"
	"testing"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func TestHTTP_MetricsWithIllegalMethod(t *testing.T) {
	assert := assert.New(t)

	t.Parallel()
	httpTest(t, nil, func(s *TestAgent) {
		req, err := http.NewRequest("DELETE", "/v1/metrics", nil)
		assert.Nil(err)
		respW := httptest.NewRecorder()

		_, err = s.Server.MetricsRequest(respW, req)
		assert.NotNil(err, "HTTP DELETE should not be accepted for this endpoint")
	})
}

func TestHTTP_Metrics(t *testing.T) {
	assert := assert.New(t)

	t.Parallel()
	httpTest(t, nil, func(s *TestAgent) {
		// make a separate HTTP request first, to ensure Nomad has written metrics
		// and prevent a race condition
		req, err := http.NewRequest("GET", "/v1/agent/self", nil)
		assert.Nil(err)
		respW := httptest.NewRecorder()
		s.Server.AgentSelfRequest(respW, req)

		// now make a metrics endpoint request, which should be already initialized
		// and written to
		req, err = http.NewRequest("GET", "/v1/metrics", nil)
		assert.Nil(err)
		respW = httptest.NewRecorder()

		testutil.WaitForResult(func() (bool, error) {
			resp, err := s.Server.MetricsRequest(respW, req)
			if err != nil {
				return false, err
			}
			respW.Flush()

			res := resp.(metrics.MetricsSummary)
			return len(res.Gauges) != 0, nil
		}, func(err error) {
			t.Fatalf("should have metrics: %v", err)
		})
	})
}

// When emitting metrics, the client should use the local copy of the allocs with
// updated task states (not the copy submitted by the server).
//
// The test registers a batch job with numTasks allocations, waits for the job
// status to become dead, and then polls the metrics endpoint until this node's
// client.allocations gauges report zero pending, zero running and numTasks
// terminal allocations.
func TestHTTP_FreshClientAllocMetrics(t *testing.T) {
	t.Parallel()
	require := require.New(t)
	numTasks := 10

	httpTest(t, func(c *Config) {
		c.Telemetry.PublishAllocationMetrics = true
		c.Telemetry.PublishNodeMetrics = true
		c.Telemetry.BackwardsCompatibleMetrics = false
		c.Telemetry.DisableTaggedMetrics = false
	}, func(s *TestAgent) {
		// Create the job, wait for it to finish
		job := mock.BatchJob()
		job.TaskGroups[0].Count = numTasks
		testutil.RegisterJob(t, s.RPC, job)

		// Remember the last observed status so a timeout can report it.
		lastStatus := "<not observed>"
		testutil.WaitForResult(func() (bool, error) {
			time.Sleep(200 * time.Millisecond)
			args := &structs.JobSpecificRequest{}
			args.JobID = job.ID
			args.QueryOptions.Region = "global"
			var resp structs.SingleJobResponse
			if err := s.RPC("Job.GetJob", args, &resp); err != nil {
				return false, err
			}
			// Guard against a response without a job instead of
			// dereferencing a nil pointer.
			if resp.Job == nil {
				lastStatus = "<job not found>"
				return false, nil
			}
			lastStatus = resp.Job.Status
			return lastStatus == structs.JobStatusDead, nil
		}, func(err error) {
			require.Fail("timed-out waiting for job to complete",
				"last job status: %q, last error: %v", lastStatus, err)
		})

		nodeID := s.client.NodeID()

		// wait for metrics to converge; -1 marks a gauge that was never seen.
		// The values deliberately persist across attempts.
		var pending, running, terminal float32 = -1.0, -1.0, -1.0
		testutil.WaitForResultRetries(100, func() (bool, error) {
			time.Sleep(100 * time.Millisecond)
			req, err := http.NewRequest("GET", "/v1/metrics", nil)
			require.NoError(err)
			respW := httptest.NewRecorder()

			obj, err := s.Server.MetricsRequest(respW, req)
			if err != nil {
				return false, err
			}

			// Named summary (not metrics) so the imported metrics package is
			// not shadowed; the checked assertion fails the test instead of
			// panicking on an unexpected type.
			summary, ok := obj.(metrics.MetricsSummary)
			require.True(ok, "expected metrics.MetricsSummary, got %T", obj)

			for _, g := range summary.Gauges {
				// ignore client metrics belonging to other test nodes
				// from other tests that contaminate go-metrics reporting
				if g.DisplayLabels["node_id"] != nodeID {
					continue
				}

				switch {
				case strings.HasSuffix(g.Name, "client.allocations.pending"):
					pending = g.Value
				case strings.HasSuffix(g.Name, "client.allocations.running"):
					running = g.Value
				case strings.HasSuffix(g.Name, "client.allocations.terminal"):
					terminal = g.Value
				}
			}
			// client alloc metrics should reflect that there is numTasks terminal allocs and no other allocs
			return pending == float32(0) && running == float32(0) &&
				terminal == float32(numTasks), nil
		}, func(err error) {
			require.Fail("timed out waiting for metrics to converge",
				"expected: (pending: 0, running: 0, terminal: %v), got: (pending: %v, running: %v, terminal: %v), last error: %v", numTasks, pending, running, terminal, err)
		})
	})
}
add http endpoint for in memory metrics prevent against flaky test due to timing/initialization issues 2017-09-04 03:50:05 +00:00			`package agent`

			`import (`
			`"net/http"`
			`"net/http/httptest"`
test case for 5540 (#5590) * client/metrics: modified metrics to use (updated) client copy of allocation instead of (unupdated) server copy * updated armon/go-metrics to address race condition in DisplayMetrics 2019-04-30 14:31:35 +00:00			`"strings"`
add http endpoint for in memory metrics prevent against flaky test due to timing/initialization issues 2017-09-04 03:50:05 +00:00			`"testing"`
test case for 5540 (#5590) * client/metrics: modified metrics to use (updated) client copy of allocation instead of (unupdated) server copy * updated armon/go-metrics to address race condition in DisplayMetrics 2019-04-30 14:31:35 +00:00			`"time"`
add http endpoint for in memory metrics prevent against flaky test due to timing/initialization issues 2017-09-04 03:50:05 +00:00
test case for 5540 (#5590) * client/metrics: modified metrics to use (updated) client copy of allocation instead of (unupdated) server copy * updated armon/go-metrics to address race condition in DisplayMetrics 2019-04-30 14:31:35 +00:00			`"github.com/armon/go-metrics"`
			`"github.com/hashicorp/nomad/nomad/mock"`
			`"github.com/hashicorp/nomad/nomad/structs"`
metrics test 2017-10-24 01:38:36 +00:00			`"github.com/hashicorp/nomad/testutil"`
add http endpoint for in memory metrics prevent against flaky test due to timing/initialization issues 2017-09-04 03:50:05 +00:00			`"github.com/stretchr/testify/assert"`
test case for 5540 (#5590) * client/metrics: modified metrics to use (updated) client copy of allocation instead of (unupdated) server copy * updated armon/go-metrics to address race condition in DisplayMetrics 2019-04-30 14:31:35 +00:00			`"github.com/stretchr/testify/require"`
add http endpoint for in memory metrics prevent against flaky test due to timing/initialization issues 2017-09-04 03:50:05 +00:00			`)`

			`func TestHTTP_MetricsWithIllegalMethod(t *testing.T) {`
			`assert := assert.New(t)`

			`t.Parallel()`
			`httpTest(t, nil, func(s *TestAgent) {`
			`req, err := http.NewRequest("DELETE", "/v1/metrics", nil)`
			`assert.Nil(err)`
			`respW := httptest.NewRecorder()`

			`_, err = s.Server.MetricsRequest(respW, req)`
			`assert.NotNil(err, "HTTP DELETE should not be accepted for this endpoint")`
			`})`
			`}`

			`func TestHTTP_Metrics(t *testing.T) {`
			`assert := assert.New(t)`

			`t.Parallel()`
			`httpTest(t, nil, func(s *TestAgent) {`
			`// make a separate HTTP request first, to ensure Nomad has written metrics`
			`// and prevent a race condition`
			`req, err := http.NewRequest("GET", "/v1/agent/self", nil)`
			`assert.Nil(err)`
			`respW := httptest.NewRecorder()`
			`s.Server.AgentSelfRequest(respW, req)`

			`// now make a metrics endpoint request, which should be already initialized`
			`// and written to`
			`req, err = http.NewRequest("GET", "/v1/metrics", nil)`
			`assert.Nil(err)`
			`respW = httptest.NewRecorder()`

metrics test 2017-10-24 01:38:36 +00:00			`testutil.WaitForResult(func() (bool, error) {`
			`resp, err := s.Server.MetricsRequest(respW, req)`
			`if err != nil {`
			`return false, err`
			`}`
			`respW.Flush()`

			`res := resp.(metrics.MetricsSummary)`
			`return len(res.Gauges) != 0, nil`
			`}, func(err error) {`
			`t.Fatalf("should have metrics: %v", err)`
			`})`
add http endpoint for in memory metrics prevent against flaky test due to timing/initialization issues 2017-09-04 03:50:05 +00:00			`})`
			`}`
test case for 5540 (#5590) * client/metrics: modified metrics to use (updated) client copy of allocation instead of (unupdated) server copy * updated armon/go-metrics to address race condition in DisplayMetrics 2019-04-30 14:31:35 +00:00
			`// When emitting metrics, the client should use the local copy of the allocs with`
			`// updated task states (not the copy submitted by the server).`
			`func TestHTTP_FreshClientAllocMetrics(t *testing.T) {`
			`t.Parallel()`
			`require := require.New(t)`
			`numTasks := 10`

			`httpTest(t, func(c *Config) {`
			`c.Telemetry.PublishAllocationMetrics = true`
			`c.Telemetry.PublishNodeMetrics = true`
			`c.Telemetry.BackwardsCompatibleMetrics = false`
			`c.Telemetry.DisableTaggedMetrics = false`
			`}, func(s *TestAgent) {`
			`// Create the job, wait for it to finish`
			`job := mock.BatchJob()`
			`job.TaskGroups[0].Count = numTasks`
			`testutil.RegisterJob(t, s.RPC, job)`
			`testutil.WaitForResult(func() (bool, error) {`
			`time.Sleep(200 * time.Millisecond)`
			`args := &structs.JobSpecificRequest{}`
			`args.JobID = job.ID`
			`args.QueryOptions.Region = "global"`
			`var resp structs.SingleJobResponse`
			`err := s.RPC("Job.GetJob", args, &resp)`
			`return err == nil && resp.Job.Status == "dead", err`
			`}, func(err error) {`
			`require.Fail("timed-out waiting for job to complete")`
			`})`

tests: deflake TestHTTP_FreshClientAllocMetrics The test asserts that alloc counts get reported accurately in metrics by inspecting the metrics endpoint directly. Sadly, the metrics as collected by `armon/go-metrics` seem to be stateful and may contain info from other tests. This means that the test can fail depending on the order of returned metrics. Inspecting the metrics output of one failing run, you can see the duplicate gauge entries but for different node_ids: ``` { "Name": "service-name.default-0a3ba4b6-2109-485e-be74-6864228aed3d.client.allocations.terminal", "Value": 10, "Labels": { "datacenter": "dc1", "node_class": "none", "node_id": "67402bf4-00f3-bd8d-9fa8-f4d1924a892a" } }, { "Name": "service-name.default-0a3ba4b6-2109-485e-be74-6864228aed3d.client.allocations.terminal", "Value": 0, "Labels": { "datacenter": "dc1", "node_class": "none", "node_id": "a2945b48-7e66-68e2-c922-49b20dd4e20c" } }, ``` 2019-11-22 23:41:21 +00:00			`nodeID := s.client.NodeID()`

test case for 5540 (#5590) * client/metrics: modified metrics to use (updated) client copy of allocation instead of (unupdated) server copy * updated armon/go-metrics to address race condition in DisplayMetrics 2019-04-30 14:31:35 +00:00			`// wait for metrics to converge`
			`var pending, running, terminal float32 = -1.0, -1.0, -1.0`
			`testutil.WaitForResultRetries(100, func() (bool, error) {`
			`time.Sleep(100 * time.Millisecond)`
			`req, err := http.NewRequest("GET", "/v1/metrics", nil)`
			`require.NoError(err)`
			`respW := httptest.NewRecorder()`

			`obj, err := s.Server.MetricsRequest(respW, req)`
			`if err != nil {`
			`return false, err`
			`}`

			`metrics := obj.(metrics.MetricsSummary)`
			`for _, g := range metrics.Gauges {`
tests: deflake TestHTTP_FreshClientAllocMetrics The test asserts that alloc counts get reported accurately in metrics by inspecting the metrics endpoint directly. Sadly, the metrics as collected by `armon/go-metrics` seem to be stateful and may contain info from other tests. This means that the test can fail depending on the order of returned metrics. Inspecting the metrics output of one failing run, you can see the duplicate gauge entries but for different node_ids: ``` { "Name": "service-name.default-0a3ba4b6-2109-485e-be74-6864228aed3d.client.allocations.terminal", "Value": 10, "Labels": { "datacenter": "dc1", "node_class": "none", "node_id": "67402bf4-00f3-bd8d-9fa8-f4d1924a892a" } }, { "Name": "service-name.default-0a3ba4b6-2109-485e-be74-6864228aed3d.client.allocations.terminal", "Value": 0, "Labels": { "datacenter": "dc1", "node_class": "none", "node_id": "a2945b48-7e66-68e2-c922-49b20dd4e20c" } }, ``` 2019-11-22 23:41:21 +00:00
			`// ignore client metrics belonging to other test nodes`
			`// from other tests that contaminate go-metrics reporting`
			`if g.DisplayLabels["node_id"] != nodeID {`
			`continue`
			`}`

test case for 5540 (#5590) * client/metrics: modified metrics to use (updated) client copy of allocation instead of (unupdated) server copy * updated armon/go-metrics to address race condition in DisplayMetrics 2019-04-30 14:31:35 +00:00			`if strings.HasSuffix(g.Name, "client.allocations.pending") {`
			`pending = g.Value`
			`}`
			`if strings.HasSuffix(g.Name, "client.allocations.running") {`
			`running = g.Value`
			`}`
			`if strings.HasSuffix(g.Name, "client.allocations.terminal") {`
			`terminal = g.Value`
			`}`
			`}`
			`// client alloc metrics should reflect that there is numTasks terminal allocs and no other allocs`
			`return pending == float32(0) && running == float32(0) &&`
			`terminal == float32(numTasks), nil`
			`}, func(err error) {`
			`require.Fail("timed out waiting for metrics to converge",`
command: Improve metrics fail logging 2019-09-19 02:17:42 +00:00			`"expected: (pending: 0, running: 0, terminal: %v), got: (pending: %v, running: %v, terminal: %v)", numTasks, pending, running, terminal)`
test case for 5540 (#5590) * client/metrics: modified metrics to use (updated) client copy of allocation instead of (unupdated) server copy * updated armon/go-metrics to address race condition in DisplayMetrics 2019-04-30 14:31:35 +00:00			`})`
			`})`
			`}`