open-nomad/e2e/metrics/metrics.go
Tim Gross 4ba5691656
e2e: give metrics longer to settle (#6884)
Increase the shortened timeout after the first loop so that metrics
that take longer to come in aren't failing the test unnecessarily.

Move the check for empty alloc metrics into the loop so that if the
first values we get are empty we don't fail the test too early.
2019-12-20 10:39:35 -05:00

206 lines
6.2 KiB
Go

package metrics
import (
"fmt"
"os"
"testing"
"time"
"github.com/hashicorp/nomad/e2e/e2eutil"
"github.com/hashicorp/nomad/e2e/framework"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/prometheus/common/model"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// MetricsTest is the e2e test case for the "Metrics" suite. It embeds
// framework.TC and carries the state shared between the suite's hooks and
// test methods.
type MetricsTest struct {
	framework.TC

	// jobIDs collects the IDs of the workload jobs registered by
	// runWorkloads so that AfterEach can deregister them.
	jobIDs []string

	// The fields below are not referenced in this part of the file;
	// presumably they are populated by setUpPrometheus and consumed by
	// promQuery/tearDownPrometheus -- confirm against those helpers.
	prometheusID          string
	fabioID, fabioAddress string
}
// init registers the "Metrics" test suite, with MetricsTest as its only
// test case, with the e2e framework.
func init() {
	suite := &framework.TestSuite{
		Component:   "Metrics",
		CanRunLocal: true,
		Cases:       []framework.TestCase{new(MetricsTest)},
	}
	framework.AddSuites(suite)
}
// BeforeAll waits for the cluster to have a leader and ready nodes, then
// stands up prometheus to collect metrics from all clients and allocs.
// Fabio runs as a system job in front of prometheus so that prometheus
// does not need to use host networking. The test fails immediately if
// the prometheus setup returns an error.
func (tc *MetricsTest) BeforeAll(f *framework.F) {
	t := f.T()

	e2eutil.WaitForLeader(t, tc.Nomad())
	e2eutil.WaitForNodesReady(t, tc.Nomad(), 1)

	require.Nil(t, tc.setUpPrometheus(f))
}
// AfterEach deregisters the workload jobs recorded in tc.jobIDs, resets
// that list, and triggers a system garbage collection. Fabio and
// prometheus are left running so they can be reused between the two test
// cases (Windows vs Linux). Nothing is cleaned up when the
// NOMAD_TEST_SKIPCLEANUP environment variable is set to "1".
func (tc *MetricsTest) AfterEach(f *framework.F) {
	skipCleanup := os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1"
	if skipCleanup {
		return
	}

	// cleanup is best-effort: the results of Deregister and
	// GarbageCollect are intentionally discarded
	for i := range tc.jobIDs {
		tc.Nomad().Jobs().Deregister(tc.jobIDs[i], true, nil)
	}
	tc.jobIDs = []string{}

	tc.Nomad().System().GarbageCollect()
}
// AfterAll tears down fabio/prometheus once the whole suite has run,
// unless the NOMAD_TEST_SKIPCLEANUP environment variable is set to "1".
func (tc *MetricsTest) AfterAll(f *framework.F) {
	if skip := os.Getenv("NOMAD_TEST_SKIPCLEANUP"); skip != "1" {
		tc.tearDownPrometheus(f)
	}
}
// TestMetricsLinux runs a collection of jobs that exercise alloc metrics,
// then queries prometheus to verify that client and alloc metrics are
// being collected and correctly presented to the prometheus scraper.
// The test is skipped when no Linux client nodes are listed.
func (tc *MetricsTest) TestMetricsLinux(f *framework.F) {
	t := f.T()

	linuxNodes, err := e2eutil.ListLinuxClientNodes(tc.Nomad())
	require.Nil(t, err)
	if len(linuxNodes) < 1 {
		t.Skip("no Linux clients")
	}

	// job name -> alloc metric that is queried for that job
	jobMetrics := map[string]string{
		"cpustress":  "nomad_client_allocs_cpu_user",
		"memstress":  "nomad_client_allocs_memory_usage",
		"helloworld": "nomad_client_allocs_cpu_allocated",
		"simpleweb":  "nomad_client_allocs_memory_rss",
		"diskstress": "nomad_client_allocs_memory_rss", // TODO(tgross): do we have disk stats?
	}

	tc.runWorkloads(t, jobMetrics)
	tc.queryClientMetrics(t, linuxNodes)
	tc.queryAllocMetrics(t, jobMetrics)
}
// TestMetricsWindows runs a collection of jobs that exercise alloc
// metrics, then queries prometheus to verify that client and alloc
// metrics are being collected and correctly presented to the prometheus
// scraper. The test is skipped when no Windows client nodes are listed.
func (tc *MetricsTest) TestMetricsWindows(f *framework.F) {
	t := f.T()

	windowsNodes, err := e2eutil.ListWindowsClientNodes(tc.Nomad())
	require.Nil(t, err)
	if len(windowsNodes) < 1 {
		t.Skip("no Windows clients")
	}

	// job name -> alloc metric that is queried for that job
	jobMetrics := map[string]string{
		"mem_windows":       "nomad_client_allocs_memory_rss",
		"factorial_windows": "nomad_client_allocs_cpu_user",
	}

	tc.runWorkloads(t, jobMetrics)
	tc.queryClientMetrics(t, windowsNodes)
	tc.queryAllocMetrics(t, jobMetrics)
}
// runWorkloads registers one job per key of workloads and waits for its
// allocations. Each job is loaded from metrics/input/<name>.nomad and
// registered under the ID "metrics-<name>-<8 chars of a fresh UUID>".
// The ID is recorded in tc.jobIDs before registration so that AfterEach
// can deregister it. The test fails fatally if a job yields no
// allocations.
func (tc *MetricsTest) runWorkloads(t *testing.T, workloads map[string]string) {
	for name := range workloads {
		suffix := uuid.Generate()[:8]
		jobID := "metrics-" + name + "-" + suffix
		tc.jobIDs = append(tc.jobIDs, jobID)

		jobFile := "metrics/input/" + name + ".nomad"
		if allocs := e2eutil.RegisterAndWaitForAllocs(t, tc.Nomad(), jobFile, jobID); len(allocs) == 0 {
			t.Fatalf("failed to register %s", jobID)
		}
	}
}
// queryClientMetrics queries prometheus to verify that metrics are being
// collected from clients. For each client metric it polls the query once
// per second until every node ID in clientNodes appears as a node_id
// label in the results, and fails the test if that does not happen
// before the timeout.
func (tc *MetricsTest) queryClientMetrics(t *testing.T, clientNodes []string) {
	clientMetrics := []string{
		"nomad_client_allocated_memory",
		"nomad_client_host_cpu_user",
		"nomad_client_host_disk_available",
		"nomad_client_host_memory_used",
		"nomad_client_uptime",
	}

	// the first query gets a very long timeout because it takes a while
	// for prometheus to be live and for jobs to initially register
	// metrics.
	timeout := 60 * time.Second

	for _, metric := range clientMetrics {
		var (
			results model.Vector
			err     error
		)

		// allNodesReporting is the polled condition; it leaves the reason
		// for the most recent failure in err for the assertion message
		allNodesReporting := func() bool {
			if results, err = tc.promQuery(metric); err != nil {
				return false
			}

			seen := make(map[string]bool)
			for _, sample := range results {
				seen[string(sample.Metric["node_id"])] = true
			}

			// only clients for a specific OS are under test, so look for
			// the specific node IDs rather than comparing lengths
			for _, nodeID := range clientNodes {
				if !seen[nodeID] {
					err = fmt.Errorf("expected metric '%s' for all clients. got:\n%v",
						metric, results)
					return false
				}
			}
			return true
		}

		ok := assert.Eventually(t, allNodesReporting, timeout, 1*time.Second)
		require.Truef(t, ok, "prometheus query failed (%s): %v", metric, err)

		// once the first metric has been queried successfully, use a
		// shorter timeout so that a problem with a single metric does
		// not hang the whole test run
		timeout = 15 * time.Second
	}
}
// queryAllocMetrics queries prometheus to verify that metrics are being
// collected from allocations. workloads maps each job name to the alloc
// metric checked for that job; the metric is filtered by the job name in
// the exported_job label. Each query is polled once per second until the
// last sample of the result is greater than zero, and the test fails if
// that does not happen before the timeout.
func (tc *MetricsTest) queryAllocMetrics(t *testing.T, workloads map[string]string) {
	// we start with a very long timeout here because it takes a while for
	// prometheus to be live and for jobs to initially register metrics.
	timeout := 60 * time.Second

	for jobName, metric := range workloads {
		query := fmt.Sprintf("%s{exported_job=\"%s\"}", metric, jobName)
		var results model.Vector
		var err error
		ok := assert.Eventually(t, func() bool {
			results, err = tc.promQuery(query)
			if err != nil {
				return false
			}

			// an empty result most likely means nothing has been scraped
			// for this job yet: keep polling instead of indexing into an
			// empty vector, which would panic inside the polling
			// goroutine and take down the whole test binary
			if len(results) == 0 {
				err = fmt.Errorf("expected metrics for query, got none")
				return false
			}

			// make sure we didn't just collect a bunch of zero metrics;
			// written as !(v > 0) so that a NaN sample is rejected too
			lastResult := results[len(results)-1]
			if !(float64(lastResult.Value) > 0.0) {
				err = fmt.Errorf("expected non-zero metrics, got: %v", results)
				return false
			}
			return true
		}, timeout, 1*time.Second)
		require.Truef(t, ok, "prometheus query failed (%s): %v", query, err)

		// shorten the timeout after the first workload is successfully
		// queried so that we don't hang the whole test run if something's
		// wrong with only one of the jobs
		timeout = 15 * time.Second
	}
}