// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package metrics

import (
	"fmt"
	"os"
	"testing"
	"time"

	"github.com/hashicorp/nomad/e2e/e2eutil"
	"github.com/hashicorp/nomad/e2e/framework"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/testutil"

	"github.com/prometheus/common/model"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

type MetricsTest struct {
	framework.TC
	jobIDs       []string
	prometheusID string
	fabioID      string
	fabioAddress string
}

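// setUpPrometheus, tearDownPrometheus, and promQuery are helper methods on
// MetricsTest defined elsewhere in this package; the prometheusID, fabioID,
// and fabioAddress fields above are presumably populated by setUpPrometheus.
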
func init() {
	framework.AddSuites(&framework.TestSuite{
		Component:   "Metrics",
		CanRunLocal: true,
		Cases: []framework.TestCase{
			new(MetricsTest),
		},
	})
}

// BeforeAll stands up Prometheus to collect metrics from all clients and
// allocs, with fabio as a system job in front of it so that we don't need to
// have prometheus use host networking.
func (tc *MetricsTest) BeforeAll(f *framework.F) {
	t := f.T()
	e2eutil.WaitForLeader(t, tc.Nomad())
	e2eutil.WaitForNodesReady(t, tc.Nomad(), 1)
	err := tc.setUpPrometheus(f)
	require.Nil(t, err)
}

// AfterEach cleans up the target jobs after each test case, but keeps
// fabio/prometheus for reuse between the two test cases (Windows vs Linux).
func (tc *MetricsTest) AfterEach(f *framework.F) {
	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
		return
	}
	for _, jobID := range tc.jobIDs {
		tc.Nomad().Jobs().Deregister(jobID, true, nil)
	}
	tc.jobIDs = []string{}
	tc.Nomad().System().GarbageCollect()
}

// AfterAll cleans up fabio/prometheus.
func (tc *MetricsTest) AfterAll(f *framework.F) {
	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
		return
	}
	tc.tearDownPrometheus(f)
}

// TestMetricsLinux runs a collection of jobs that exercise alloc metrics.
// Then we query prometheus to verify we're collecting client and alloc metrics
// and correctly presenting them to the prometheus scraper.
func (tc *MetricsTest) TestMetricsLinux(f *framework.F) {
	t := f.T()
	clientNodes, err := e2eutil.ListLinuxClientNodes(tc.Nomad())
	require.Nil(t, err)
	if len(clientNodes) == 0 {
		t.Skip("no Linux clients")
	}

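	// each key names a job spec at metrics/input/<name>.nomad; the value is
	// the allocation metric we expect that job to report once it's running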
	workloads := map[string]string{
		"cpustress":  "nomad_client_allocs_cpu_user",
		"diskstress": "nomad_client_allocs_memory_rss", // TODO(tgross): do we have disk stats?
		"helloworld": "nomad_client_allocs_cpu_allocated",
		"memstress":  "nomad_client_allocs_memory_usage",
		"simpleweb":  "nomad_client_allocs_memory_rss",
	}

	tc.runWorkloads(t, workloads)
	tc.queryClientMetrics(t, clientNodes)
	tc.queryAllocMetrics(t, workloads)
}

// TestMetricsWindows runs a collection of jobs that exercise alloc metrics.
// Then we query prometheus to verify we're collecting client and alloc metrics
// and correctly presenting them to the prometheus scraper.
func (tc *MetricsTest) TestMetricsWindows(f *framework.F) {
	t := f.T()
	clientNodes, err := e2eutil.ListWindowsClientNodes(tc.Nomad())
	require.Nil(t, err)
	if len(clientNodes) == 0 {
		t.Skip("no Windows clients")
	}

	workloads := map[string]string{
		"factorial_windows": "nomad_client_allocs_cpu_user",
		"mem_windows":       "nomad_client_allocs_memory_rss",
	}

	tc.runWorkloads(t, workloads)
	tc.queryClientMetrics(t, clientNodes)
	tc.queryAllocMetrics(t, workloads)
}

// runWorkloads runs the workloads and waits for their allocations.
func (tc *MetricsTest) runWorkloads(t *testing.T, workloads map[string]string) {
	for jobName := range workloads {
		uuid := uuid.Generate()
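		// the random suffix keeps job IDs unique across test runs, so
		// AfterEach deregisters only the jobs registered by this run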
		jobID := "metrics-" + jobName + "-" + uuid[0:8]
		tc.jobIDs = append(tc.jobIDs, jobID)
		file := "metrics/input/" + jobName + ".nomad"
		allocs := e2eutil.RegisterAndWaitForAllocs(t, tc.Nomad(), file, jobID, "")
		require.NotZerof(t, allocs, "failed to register %s", jobID)
	}
}

// queryClientMetrics queries prometheus to verify that metrics are being
// collected from clients.
func (tc *MetricsTest) queryClientMetrics(t *testing.T, clientNodes []string) {
	metrics := []string{
		"nomad_client_allocated_memory",
		"nomad_client_host_cpu_user",
		"nomad_client_host_disk_available",
		"nomad_client_host_memory_used",
		"nomad_client_uptime",
	}
	// we start with a very generous retry budget here because it takes a
	// while for prometheus to be live and for jobs to initially register
	// metrics.
	retries := int64(60)

	for _, metric := range metrics {
		var results model.Vector
		var err error

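		// testutil.WaitForResultRetries re-runs the check function until it
		// returns true or the retry budget is exhausted; the deferred sleep
		// below spaces attempts roughly a second apart, so 60 retries gives
		// prometheus about a minute to come up and start scraping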
		testutil.WaitForResultRetries(retries, func() (bool, error) {
			defer time.Sleep(time.Second)

			results, err = tc.promQuery(metric)
			if err != nil {
				return false, err
			}

			instances := make(map[string]struct{})
			for _, result := range results {
				instances[string(result.Metric["node_id"])] = struct{}{}
			}
			// we're testing only clients for a specific OS, so we
			// want to make sure we're checking for specific node_ids
			// and not just equal lengths
			for _, clientNode := range clientNodes {
				if _, ok := instances[clientNode]; !ok {
					return false, fmt.Errorf("expected metric '%s' for all clients. got:\n%v", metric, results)
				}
			}
			return true, nil
		}, func(err error) {
			require.NoError(t, err)
		})

		// shorten the retry budget after the first metric is successfully
		// queried so that we don't hang the whole test run if something's
		// wrong with only one of the queries
		retries = 15
	}
}

// queryAllocMetrics queries prometheus to verify that metrics are being
// collected from allocations.
func (tc *MetricsTest) queryAllocMetrics(t *testing.T, workloads map[string]string) {
	// we start with a very long timeout here because it takes a while for
	// prometheus to be live and for jobs to initially register metrics.
	timeout := 60 * time.Second
	for jobName, metric := range workloads {
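		// prometheus renames a scraped metric's own `job` label to
		// `exported_job` so it doesn't clash with the scrape job's label;
		// matching on it here selects the series emitted by this workload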
		query := fmt.Sprintf("%s{exported_job=\"%s\"}", metric, jobName)
		var results model.Vector
		var err error
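		// assert.Eventually polls the condition once per second until it
		// returns true or the timeout elapses; it reports the outcome as a
		// bool so we can attach the query and last error to the require below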
		ok := assert.Eventually(t, func() bool {
			results, err = tc.promQuery(query)
			if err != nil {
				return false
			}

			// make sure we didn't just collect a bunch of zero metrics
			if len(results) == 0 {
				// guard the index below against an empty result vector
				err = fmt.Errorf("expected metrics, got empty results for %q", query)
				return false
			}
			lastResult := results[len(results)-1]
			if !(float64(lastResult.Value) > 0.0) {
				err = fmt.Errorf("expected non-zero metrics, got: %v", results)
				return false
			}
			return true
		}, timeout, 1*time.Second)
		require.Truef(t, ok, "prometheus query failed (%s): %v", query, err)

		// shorten the timeout after the first workload is successfully
		// queried so that we don't hang the whole test run if something's
		// wrong with only one of the jobs
		timeout = 15 * time.Second
	}
}