open-nomad/command/agent/metrics_endpoint_test.go

package agent

import (
	"net/http"
	"net/http/httptest"
	"strings"
	"testing"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/nomad/ci"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// TestHTTP_MetricsWithIllegalMethod sends an HTTP DELETE request to the
// metrics endpoint handler and expects the handler to return an error.
func TestHTTP_MetricsWithIllegalMethod(t *testing.T) {
	ci.Parallel(t)
	check := assert.New(t)

	exercise := func(agent *TestAgent) {
		request, reqErr := http.NewRequest("DELETE", "/v1/metrics", nil)
		check.Nil(reqErr)

		recorder := httptest.NewRecorder()
		_, callErr := agent.Server.MetricsRequest(recorder, request)
		check.NotNil(callErr, "HTTP DELETE should not be accepted for this endpoint")
	}

	httpTest(t, nil, exercise)
}

// TestHTTP_MetricsPrometheusDisabled verifies that requesting Prometheus
// formatted metrics fails when Prometheus metrics are disabled in the agent's
// telemetry configuration.
func TestHTTP_MetricsPrometheusDisabled(t *testing.T) {
	ci.Parallel(t)
	assert := assert.New(t)

	httpTest(t, func(c *Config) { c.Telemetry.PrometheusMetrics = false }, func(s *TestAgent) {
		req, err := http.NewRequest("GET", "/v1/metrics?format=prometheus", nil)
		assert.Nil(err)

		// NOTE(review): the handler is handed a nil ResponseWriter, so this
		// test presumably relies on the request being rejected before any
		// response is written — confirm against the handler.
		resp, err := s.Server.MetricsRequest(nil, req)
		assert.Nil(resp)

		// assert.Error treats its trailing arguments as a failure message,
		// not as the expected error text, so the text is checked explicitly.
		// The guard avoids calling Error() on a nil error.
		if assert.Error(err) {
			assert.Contains(err.Error(), "Prometheus is not enabled")
		}
	})
}

// TestHTTP_MetricsPrometheusEnabled requests the metrics endpoint in
// Prometheus format using the default test configuration. The handler is
// expected to return neither an object nor an error, and the recorded
// response body must contain a known metric help line.
func TestHTTP_MetricsPrometheusEnabled(t *testing.T) {
	ci.Parallel(t)
	check := assert.New(t)

	httpTest(t, nil, func(agent *TestAgent) {
		request, reqErr := http.NewRequest("GET", "/v1/metrics?format=prometheus", nil)
		check.Nil(reqErr)

		recorder := httptest.NewRecorder()
		obj, callErr := agent.Server.MetricsRequest(recorder, request)
		check.Nil(obj)
		check.Nil(callErr)

		// The body must exist and include a line resembling a metric that is
		// expected to be reported.
		check.NotNil(recorder.Body)
		body := recorder.Body.String()
		check.Contains(body, "HELP go_gc_duration_seconds")
	})
}

// TestHTTP_Metrics polls the metrics endpoint until the returned metrics
// summary contains at least one gauge, and fails the test if the wait helper
// gives up first.
func TestHTTP_Metrics(t *testing.T) {
	ci.Parallel(t)
	check := assert.New(t)

	httpTest(t, nil, func(agent *TestAgent) {
		// Issue an unrelated HTTP request first so that Nomad has written
		// some metrics before the metrics endpoint is queried; this avoids a
		// race condition.
		selfReq, selfErr := http.NewRequest("GET", "/v1/agent/self", nil)
		check.Nil(selfErr)
		agent.Server.AgentSelfRequest(httptest.NewRecorder(), selfReq)

		// The metrics endpoint should be initialized and written to by now.
		metricsReq, metricsErr := http.NewRequest("GET", "/v1/metrics", nil)
		check.Nil(metricsErr)
		recorder := httptest.NewRecorder()

		// hasGauges reports whether the current summary holds any gauge.
		hasGauges := func() (bool, error) {
			obj, err := agent.Server.MetricsRequest(recorder, metricsReq)
			if err != nil {
				return false, err
			}
			recorder.Flush()

			summary := obj.(metrics.MetricsSummary)
			if len(summary.Gauges) == 0 {
				return false, nil
			}
			return true, nil
		}

		// onTimeout aborts the test when no gauges ever showed up.
		onTimeout := func(err error) {
			t.Fatalf("should have metrics: %v", err)
		}

		testutil.WaitForResult(hasGauges, onTimeout)
	})
}

// TestHTTP_FreshClientAllocMetrics checks that, when emitting metrics, the
// client uses its local copy of the allocs with updated task states (not the
// copy submitted by the server).
//
// Metrics are collected globally by go-metrics, so the summary may include
// gauges reported by agents started in other tests. Rather than forbidding
// parallel execution, the test only looks at gauges labelled with this
// agent's node ID.
func TestHTTP_FreshClientAllocMetrics(t *testing.T) {
	ci.Parallel(t)

	require := require.New(t)
	numTasks := 10

	httpTest(t, func(c *Config) {
		c.Telemetry.PublishAllocationMetrics = true
		c.Telemetry.PublishNodeMetrics = true
	}, func(s *TestAgent) {
		// Create the job, wait for it to finish
		job := mock.BatchJob()
		job.TaskGroups[0].Count = numTasks
		testutil.RegisterJob(t, s.RPC, job)
		testutil.WaitForResult(func() (bool, error) {
			time.Sleep(200 * time.Millisecond)
			args := &structs.JobSpecificRequest{}
			args.JobID = job.ID
			args.QueryOptions.Region = "global"
			var resp structs.SingleJobResponse
			if err := s.RPC("Job.GetJob", args, &resp); err != nil {
				return false, err
			}
			// Guard against a response that carries no job so the status
			// check below cannot dereference a nil pointer.
			if resp.Job == nil {
				return false, nil
			}
			return resp.Job.Status == "dead", nil
		}, func(err error) {
			// Surface the last polling error instead of dropping it.
			require.Fail("timed-out waiting for job to complete", "last error: %v", err)
		})

		nodeID := s.client.NodeID()

		// wait for metrics to converge; -1 marks a gauge that has not been
		// observed yet
		var pending, running, terminal float32 = -1.0, -1.0, -1.0
		testutil.WaitForResultRetries(100, func() (bool, error) {
			time.Sleep(100 * time.Millisecond)
			req, err := http.NewRequest("GET", "/v1/metrics", nil)
			require.NoError(err)
			respW := httptest.NewRecorder()

			obj, err := s.Server.MetricsRequest(respW, req)
			if err != nil {
				return false, err
			}

			// Named summary (not metrics) so the imported metrics package is
			// not shadowed; the checked assertion reports an unexpected type
			// as a test failure instead of a panic.
			summary, ok := obj.(metrics.MetricsSummary)
			require.True(ok, "unexpected metrics response type: %T", obj)
			for _, g := range summary.Gauges {

				// ignore client metrics belonging to other test nodes
				// from other tests that contaminate go-metrics reporting
				if g.DisplayLabels["node_id"] != nodeID {
					continue
				}

				switch {
				case strings.HasSuffix(g.Name, "client.allocations.pending"):
					pending = g.Value
				case strings.HasSuffix(g.Name, "client.allocations.running"):
					running = g.Value
				case strings.HasSuffix(g.Name, "client.allocations.terminal"):
					terminal = g.Value
				}
			}
			// client alloc metrics should reflect that there is numTasks terminal allocs and no other allocs
			return pending == float32(0) && running == float32(0) &&
				terminal == float32(numTasks), nil
		}, func(err error) {
			require.Fail("timed out waiting for metrics to converge",
				"expected: (pending: 0, running: 0, terminal: %v), got: (pending: %v, running: %v, terminal: %v)", numTasks, pending, running, terminal)
		})
	})
}
add http endpoint for in memory metrics prevent against flaky test due to timing/initialization issues 2017-09-04 03:50:05 +00:00			`package agent`

			`import (`
			`"net/http"`
			`"net/http/httptest"`
test case for 5540 (#5590) * client/metrics: modified metrics to use (updated) client copy of allocation instead of (unupdated) server copy * updated armon/go-metrics to address race condition in DisplayMetrics 2019-04-30 14:31:35 +00:00			`"strings"`
add http endpoint for in memory metrics prevent against flaky test due to timing/initialization issues 2017-09-04 03:50:05 +00:00			`"testing"`
test case for 5540 (#5590) * client/metrics: modified metrics to use (updated) client copy of allocation instead of (unupdated) server copy * updated armon/go-metrics to address race condition in DisplayMetrics 2019-04-30 14:31:35 +00:00			`"time"`
add http endpoint for in memory metrics prevent against flaky test due to timing/initialization issues 2017-09-04 03:50:05 +00:00
test case for 5540 (#5590) * client/metrics: modified metrics to use (updated) client copy of allocation instead of (unupdated) server copy * updated armon/go-metrics to address race condition in DisplayMetrics 2019-04-30 14:31:35 +00:00			`"github.com/armon/go-metrics"`
ci: swap ci parallelization for unconstrained gomaxprocs 2022-03-15 12:42:43 +00:00			`"github.com/hashicorp/nomad/ci"`
test case for 5540 (#5590) * client/metrics: modified metrics to use (updated) client copy of allocation instead of (unupdated) server copy * updated armon/go-metrics to address race condition in DisplayMetrics 2019-04-30 14:31:35 +00:00			`"github.com/hashicorp/nomad/nomad/mock"`
			`"github.com/hashicorp/nomad/nomad/structs"`
metrics test 2017-10-24 01:38:36 +00:00			`"github.com/hashicorp/nomad/testutil"`
add http endpoint for in memory metrics prevent against flaky test due to timing/initialization issues 2017-09-04 03:50:05 +00:00			`"github.com/stretchr/testify/assert"`
test case for 5540 (#5590) * client/metrics: modified metrics to use (updated) client copy of allocation instead of (unupdated) server copy * updated armon/go-metrics to address race condition in DisplayMetrics 2019-04-30 14:31:35 +00:00			`"github.com/stretchr/testify/require"`
add http endpoint for in memory metrics prevent against flaky test due to timing/initialization issues 2017-09-04 03:50:05 +00:00			`)`

			`func TestHTTP_MetricsWithIllegalMethod(t *testing.T) {`
ci: swap ci parallelization for unconstrained gomaxprocs 2022-03-15 12:42:43 +00:00			`ci.Parallel(t)`
add http endpoint for in memory metrics prevent against flaky test due to timing/initialization issues 2017-09-04 03:50:05 +00:00			`assert := assert.New(t)`

			`httpTest(t, nil, func(s *TestAgent) {`
			`req, err := http.NewRequest("DELETE", "/v1/metrics", nil)`
			`assert.Nil(err)`
			`respW := httptest.NewRecorder()`

			`_, err = s.Server.MetricsRequest(respW, req)`
			`assert.NotNil(err, "HTTP DELETE should not be accepted for this endpoint")`
			`})`
			`}`

agent: return req error if prometheus metrics are disabled. If the user has disabled Prometheus metrics and a request is sent to the metrics endpoint requesting Prometheus formatted metrics, then the request should fail. 2021-03-09 14:28:58 +00:00			`func TestHTTP_MetricsPrometheusDisabled(t *testing.T) {`
ci: swap ci parallelization for unconstrained gomaxprocs 2022-03-15 12:42:43 +00:00			`ci.Parallel(t)`
agent: return req error if prometheus metrics are disabled. If the user has disabled Prometheus metrics and a request is sent to the metrics endpoint requesting Prometheus formatted metrics, then the request should fail. 2021-03-09 14:28:58 +00:00			`assert := assert.New(t)`

			`httpTest(t, func(c *Config) { c.Telemetry.PrometheusMetrics = false }, func(s *TestAgent) {`
			`req, err := http.NewRequest("GET", "/v1/metrics?format=prometheus", nil)`
			`assert.Nil(err)`

			`resp, err := s.Server.MetricsRequest(nil, req)`
			`assert.Nil(resp)`
			`assert.Error(err, "Prometheus is not enabled")`
			`})`
			`}`

			`func TestHTTP_MetricsPrometheusEnabled(t *testing.T) {`
ci: swap ci parallelization for unconstrained gomaxprocs 2022-03-15 12:42:43 +00:00			`ci.Parallel(t)`
agent: return req error if prometheus metrics are disabled. If the user has disabled Prometheus metrics and a request is sent to the metrics endpoint requesting Prometheus formatted metrics, then the request should fail. 2021-03-09 14:28:58 +00:00			`assert := assert.New(t)`

			`httpTest(t, nil, func(s *TestAgent) {`
			`req, err := http.NewRequest("GET", "/v1/metrics?format=prometheus", nil)`
			`assert.Nil(err)`
			`respW := httptest.NewRecorder()`

			`resp, err := s.Server.MetricsRequest(respW, req)`
			`assert.Nil(resp)`
			`assert.Nil(err)`

			`// Ensure the response body is not empty and that it contains something`
			`// that looks like a metric we expect.`
			`assert.NotNil(respW.Body)`
			`assert.Contains(respW.Body.String(), "HELP go_gc_duration_seconds")`
			`})`
			`}`

add http endpoint for in memory metrics prevent against flaky test due to timing/initialization issues 2017-09-04 03:50:05 +00:00			`func TestHTTP_Metrics(t *testing.T) {`
ci: swap ci parallelization for unconstrained gomaxprocs 2022-03-15 12:42:43 +00:00			`ci.Parallel(t)`
add http endpoint for in memory metrics prevent against flaky test due to timing/initialization issues 2017-09-04 03:50:05 +00:00			`assert := assert.New(t)`

			`httpTest(t, nil, func(s *TestAgent) {`
			`// make a separate HTTP request first, to ensure Nomad has written metrics`
			`// and prevent a race condition`
			`req, err := http.NewRequest("GET", "/v1/agent/self", nil)`
			`assert.Nil(err)`
			`respW := httptest.NewRecorder()`
			`s.Server.AgentSelfRequest(respW, req)`

			`// now make a metrics endpoint request, which should be already initialized`
			`// and written to`
			`req, err = http.NewRequest("GET", "/v1/metrics", nil)`
			`assert.Nil(err)`
			`respW = httptest.NewRecorder()`

metrics test 2017-10-24 01:38:36 +00:00			`testutil.WaitForResult(func() (bool, error) {`
			`resp, err := s.Server.MetricsRequest(respW, req)`
			`if err != nil {`
			`return false, err`
			`}`
			`respW.Flush()`

			`res := resp.(metrics.MetricsSummary)`
			`return len(res.Gauges) != 0, nil`
			`}, func(err error) {`
			`t.Fatalf("should have metrics: %v", err)`
			`})`
add http endpoint for in memory metrics prevent against flaky test due to timing/initialization issues 2017-09-04 03:50:05 +00:00			`})`
			`}`
test case for 5540 (#5590) * client/metrics: modified metrics to use (updated) client copy of allocation instead of (unupdated) server copy * updated armon/go-metrics to address race condition in DisplayMetrics 2019-04-30 14:31:35 +00:00
			`// When emitting metrics, the client should use the local copy of the allocs with`
			`// updated task states (not the copy submitted by the server).`
test: fix flaky TestHTTP_FreshClientAllocMetrics 2020-02-07 23:39:06 +00:00			`//`
			`// Cannot be run in parallel as metrics are global.`
test case for 5540 (#5590) * client/metrics: modified metrics to use (updated) client copy of allocation instead of (unupdated) server copy * updated armon/go-metrics to address race condition in DisplayMetrics 2019-04-30 14:31:35 +00:00			`func TestHTTP_FreshClientAllocMetrics(t *testing.T) {`
ci: swap ci parallelization for unconstrained gomaxprocs 2022-03-15 12:42:43 +00:00			`ci.Parallel(t)`

test case for 5540 (#5590) * client/metrics: modified metrics to use (updated) client copy of allocation instead of (unupdated) server copy * updated armon/go-metrics to address race condition in DisplayMetrics 2019-04-30 14:31:35 +00:00			`require := require.New(t)`
			`numTasks := 10`

			`httpTest(t, func(c *Config) {`
			`c.Telemetry.PublishAllocationMetrics = true`
			`c.Telemetry.PublishNodeMetrics = true`
			`}, func(s *TestAgent) {`
			`// Create the job, wait for it to finish`
			`job := mock.BatchJob()`
			`job.TaskGroups[0].Count = numTasks`
			`testutil.RegisterJob(t, s.RPC, job)`
			`testutil.WaitForResult(func() (bool, error) {`
			`time.Sleep(200 * time.Millisecond)`
			`args := &structs.JobSpecificRequest{}`
			`args.JobID = job.ID`
			`args.QueryOptions.Region = "global"`
			`var resp structs.SingleJobResponse`
			`err := s.RPC("Job.GetJob", args, &resp)`
			`return err == nil && resp.Job.Status == "dead", err`
			`}, func(err error) {`
			`require.Fail("timed-out waiting for job to complete")`
			`})`

tests: deflake TestHTTP_FreshClientAllocMetrics The test asserts that alloc counts get reported accurately in metrics by inspecting the metrics endpoint directly. Sadly, the metrics as collected by `armon/go-metrics` seem to be stateful and may contain info from other tests. This means that the test can fail depending on the order of returned metrics. Inspecting the metrics output of one failing run, you can see the duplicate gauge entries but for different node_ids: ``` { "Name": "service-name.default-0a3ba4b6-2109-485e-be74-6864228aed3d.client.allocations.terminal", "Value": 10, "Labels": { "datacenter": "dc1", "node_class": "none", "node_id": "67402bf4-00f3-bd8d-9fa8-f4d1924a892a" } }, { "Name": "service-name.default-0a3ba4b6-2109-485e-be74-6864228aed3d.client.allocations.terminal", "Value": 0, "Labels": { "datacenter": "dc1", "node_class": "none", "node_id": "a2945b48-7e66-68e2-c922-49b20dd4e20c" } }, ``` 2019-11-22 23:41:21 +00:00			`nodeID := s.client.NodeID()`

test case for 5540 (#5590) * client/metrics: modified metrics to use (updated) client copy of allocation instead of (unupdated) server copy * updated armon/go-metrics to address race condition in DisplayMetrics 2019-04-30 14:31:35 +00:00			`// wait for metrics to converge`
			`var pending, running, terminal float32 = -1.0, -1.0, -1.0`
			`testutil.WaitForResultRetries(100, func() (bool, error) {`
			`time.Sleep(100 * time.Millisecond)`
			`req, err := http.NewRequest("GET", "/v1/metrics", nil)`
			`require.NoError(err)`
			`respW := httptest.NewRecorder()`

			`obj, err := s.Server.MetricsRequest(respW, req)`
			`if err != nil {`
			`return false, err`
			`}`

			`metrics := obj.(metrics.MetricsSummary)`
			`for _, g := range metrics.Gauges {`
tests: deflake TestHTTP_FreshClientAllocMetrics The test asserts that alloc counts get reported accurately in metrics by inspecting the metrics endpoint directly. Sadly, the metrics as collected by `armon/go-metrics` seem to be stateful and may contain info from other tests. This means that the test can fail depending on the order of returned metrics. Inspecting the metrics output of one failing run, you can see the duplicate gauge entries but for different node_ids: ``` { "Name": "service-name.default-0a3ba4b6-2109-485e-be74-6864228aed3d.client.allocations.terminal", "Value": 10, "Labels": { "datacenter": "dc1", "node_class": "none", "node_id": "67402bf4-00f3-bd8d-9fa8-f4d1924a892a" } }, { "Name": "service-name.default-0a3ba4b6-2109-485e-be74-6864228aed3d.client.allocations.terminal", "Value": 0, "Labels": { "datacenter": "dc1", "node_class": "none", "node_id": "a2945b48-7e66-68e2-c922-49b20dd4e20c" } }, ``` 2019-11-22 23:41:21 +00:00
			`// ignore client metrics belonging to other test nodes`
			`// from other tests that contaminate go-metrics reporting`
			`if g.DisplayLabels["node_id"] != nodeID {`
			`continue`
			`}`

test case for 5540 (#5590) * client/metrics: modified metrics to use (updated) client copy of allocation instead of (unupdated) server copy * updated armon/go-metrics to address race condition in DisplayMetrics 2019-04-30 14:31:35 +00:00			`if strings.HasSuffix(g.Name, "client.allocations.pending") {`
			`pending = g.Value`
			`}`
			`if strings.HasSuffix(g.Name, "client.allocations.running") {`
			`running = g.Value`
			`}`
			`if strings.HasSuffix(g.Name, "client.allocations.terminal") {`
			`terminal = g.Value`
			`}`
			`}`
			`// client alloc metrics should reflect that there is numTasks terminal allocs and no other allocs`
			`return pending == float32(0) && running == float32(0) &&`
			`terminal == float32(numTasks), nil`
			`}, func(err error) {`
			`require.Fail("timed out waiting for metrics to converge",`
command: Improve metrics fail logging 2019-09-19 02:17:42 +00:00			`"expected: (pending: 0, running: 0, terminal: %v), got: (pending: %v, running: %v, terminal: %v)", numTasks, pending, running, terminal)`
test case for 5540 (#5590) * client/metrics: modified metrics to use (updated) client copy of allocation instead of (unupdated) server copy * updated armon/go-metrics to address race condition in DisplayMetrics 2019-04-30 14:31:35 +00:00			`})`
			`})`
			`}`