open-nomad/client/allocrunner/taskrunner/stats_hook_test.go
Mahmood Ali 00be4fc63c
tests: deflake TestTaskRunner_StatsHook_Periodic (#9734)
This PR deflakes TestTaskRunner_StatsHook_Periodic tests and adds backoff when the driver closes the channel.

TestTaskRunner_StatsHook_Periodic is currently the most flaky test - failing ~4% of the time (20 out of 486 workflows). A sample failure: https://app.circleci.com/pipelines/github/hashicorp/nomad/14028/workflows/957b674f-cbcc-4228-96d9-1094fdee5b9c/jobs/128563 .

This change has two components:

First, it updates the StatsHook so that it backs off when stats channel is closed. In the context of the test where the mock driver emits a single stats update and closes the channel, the test may make tens of thousands update during the period. In real context, if a driver doesn't implement the stats handler properly or when a task finishes, we may generate way too many Stats queries in a tight loop. Here, the backoff reduces these queries. I've added a failing test that shows 154,458 stats updates within 500ms in https://app.circleci.com/pipelines/github/hashicorp/nomad/14092/workflows/50672445-392d-4661-b19e-e3561ed32746/jobs/129423 .

Second, the test ignores the first stats update after a task exit. Due to the asynchronicity of updates and channel/context use, it's possible that an update is enqueued while the test marks the task as exited, resulting into a spurious update.
2021-01-06 16:03:00 -05:00

238 lines
6.1 KiB
Go

package taskrunner
import (
"context"
"sync/atomic"
"testing"
"time"
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
cstructs "github.com/hashicorp/nomad/client/structs"
"github.com/hashicorp/nomad/helper/testlog"
"github.com/stretchr/testify/require"
)
// Statically assert the stats hook implements the expected interfaces
var _ interfaces.TaskPoststartHook = (*statsHook)(nil)
var _ interfaces.TaskExitedHook = (*statsHook)(nil)
var _ interfaces.ShutdownHook = (*statsHook)(nil)
type mockStatsUpdater struct {
// Ch is sent task resource usage updates if not nil
Ch chan *cstructs.TaskResourceUsage
}
// newMockStatsUpdater returns a mockStatsUpdater that blocks on Ch for every
// call to UpdateStats
func newMockStatsUpdater() *mockStatsUpdater {
return &mockStatsUpdater{
Ch: make(chan *cstructs.TaskResourceUsage),
}
}
func (m *mockStatsUpdater) UpdateStats(ru *cstructs.TaskResourceUsage) {
if m.Ch != nil {
m.Ch <- ru
}
}
type mockDriverStats struct {
called uint32
// err is returned by Stats if it is non-nil
err error
}
func (m *mockDriverStats) Stats(ctx context.Context, interval time.Duration) (<-chan *cstructs.TaskResourceUsage, error) {
atomic.AddUint32(&m.called, 1)
if m.err != nil {
return nil, m.err
}
ru := &cstructs.TaskResourceUsage{
ResourceUsage: &cstructs.ResourceUsage{
MemoryStats: &cstructs.MemoryStats{
RSS: 1,
Measured: []string{"RSS"},
},
CpuStats: &cstructs.CpuStats{
SystemMode: 1,
Measured: []string{"System Mode"},
},
},
Timestamp: time.Now().UnixNano(),
Pids: map[string]*cstructs.ResourceUsage{},
}
ru.Pids["task"] = ru.ResourceUsage
ch := make(chan *cstructs.TaskResourceUsage)
go func() {
defer close(ch)
select {
case <-ctx.Done():
case ch <- ru:
}
}()
return ch, nil
}
func (m *mockDriverStats) Called() int {
return int(atomic.LoadUint32(&m.called))
}
// TestTaskRunner_StatsHook_PoststartExited asserts the stats hook starts and
// stops.
func TestTaskRunner_StatsHook_PoststartExited(t *testing.T) {
t.Parallel()
require := require.New(t)
logger := testlog.HCLogger(t)
su := newMockStatsUpdater()
ds := new(mockDriverStats)
poststartReq := &interfaces.TaskPoststartRequest{DriverStats: ds}
// Create hook
h := newStatsHook(su, time.Minute, logger)
// Always call Exited to cleanup goroutines
defer h.Exited(context.Background(), nil, nil)
// Run prestart
require.NoError(h.Poststart(context.Background(), poststartReq, nil))
// An initial stats collection should run and call the updater
select {
case ru := <-su.Ch:
require.Equal(uint64(1), ru.ResourceUsage.MemoryStats.RSS)
case <-time.After(10 * time.Second):
t.Fatalf("timeout waiting for initial stats collection")
}
require.NoError(h.Exited(context.Background(), nil, nil))
}
// TestTaskRunner_StatsHook_Periodic asserts the stats hook collects stats on
// an interval.
func TestTaskRunner_StatsHook_Periodic(t *testing.T) {
t.Parallel()
require := require.New(t)
logger := testlog.HCLogger(t)
su := newMockStatsUpdater()
ds := new(mockDriverStats)
poststartReq := &interfaces.TaskPoststartRequest{DriverStats: ds}
// interval needs to be high enough that even on a slow/busy VM
// Exited() can complete within the interval.
const interval = 500 * time.Millisecond
h := newStatsHook(su, interval, logger)
defer h.Exited(context.Background(), nil, nil)
// Run prestart
require.NoError(h.Poststart(context.Background(), poststartReq, nil))
// An initial stats collection should run and call the updater
var firstrun int64
select {
case ru := <-su.Ch:
if ru.Timestamp <= 0 {
t.Fatalf("expected nonzero timestamp (%v)", ru.Timestamp)
}
firstrun = ru.Timestamp
case <-time.After(10 * time.Second):
t.Fatalf("timeout waiting for initial stats collection")
}
// Should get another update in ~500ms (see interval above)
select {
case ru := <-su.Ch:
if ru.Timestamp <= firstrun {
t.Fatalf("expected timestamp (%v) after first run (%v)", ru.Timestamp, firstrun)
}
case <-time.After(10 * time.Second):
t.Fatalf("timeout waiting for second stats collection")
}
// Exiting should prevent further updates
require.NoError(h.Exited(context.Background(), nil, nil))
// Should *not* get another update in ~500ms (see interval above)
// we may get a single update due to race with exit
timeout := time.After(2 * interval)
firstUpdate := true
WAITING:
select {
case ru := <-su.Ch:
if firstUpdate {
firstUpdate = false
goto WAITING
}
t.Fatalf("unexpected update after exit (firstrun=%v; update=%v", firstrun, ru.Timestamp)
case <-timeout:
// Ok! No update after exit as expected.
}
}
// TestTaskRunner_StatsHook_NotImplemented asserts the stats hook stops if the
// driver returns NotImplemented.
func TestTaskRunner_StatsHook_NotImplemented(t *testing.T) {
t.Parallel()
require := require.New(t)
logger := testlog.HCLogger(t)
su := newMockStatsUpdater()
ds := &mockDriverStats{
err: cstructs.DriverStatsNotImplemented,
}
poststartReq := &interfaces.TaskPoststartRequest{DriverStats: ds}
h := newStatsHook(su, 1, logger)
defer h.Exited(context.Background(), nil, nil)
// Run prestart
require.NoError(h.Poststart(context.Background(), poststartReq, nil))
// An initial stats collection should run and *not* call the updater
select {
case ru := <-su.Ch:
t.Fatalf("unexpected resource update (timestamp=%v)", ru.Timestamp)
case <-time.After(500 * time.Millisecond):
// Ok! No update received because error was returned
}
}
// TestTaskRunner_StatsHook_Backoff asserts that stats hook does some backoff
// even if the driver doesn't support intervals well
func TestTaskRunner_StatsHook_Backoff(t *testing.T) {
t.Parallel()
logger := testlog.HCLogger(t)
su := newMockStatsUpdater()
ds := &mockDriverStats{}
poststartReq := &interfaces.TaskPoststartRequest{DriverStats: ds}
h := newStatsHook(su, time.Minute, logger)
defer h.Exited(context.Background(), nil, nil)
// Run prestart
require.NoError(t, h.Poststart(context.Background(), poststartReq, nil))
timeout := time.After(500 * time.Millisecond)
DRAIN:
for {
select {
case <-su.Ch:
case <-timeout:
break DRAIN
}
}
require.Equal(t, ds.Called(), 1)
}