Merge pull request #4953 from hashicorp/b-script-context-wrapper

consul: add ScriptExecutor context wrapper
This commit is contained in:
Michael Schurter 2018-12-10 17:22:53 -08:00 committed by GitHub
commit 8808ab9cea
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 95 additions and 24 deletions

View file

@ -18,6 +18,54 @@ type heartbeater interface {
UpdateTTL(id, output, status string) error UpdateTTL(id, output, status string) error
} }
// contextExec allows canceling a ScriptExecutor with a context.
type contextExec struct {
// pctx is the parent context. A subcontext will be created with Exec's
// timeout.
pctx context.Context
// exec to be wrapped in a context
exec interfaces.ScriptExecutor
}
func newContextExec(ctx context.Context, exec interfaces.ScriptExecutor) *contextExec {
return &contextExec{
pctx: ctx,
exec: exec,
}
}
type execResult struct {
buf []byte
code int
err error
}
// Exec a command until the timeout expires, the context is canceled, or the
// underlying Exec returns.
func (c *contextExec) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) {
resCh := make(chan execResult, 1)
// Don't trust the underlying implementation to obey timeout
ctx, cancel := context.WithTimeout(c.pctx, timeout)
defer cancel()
go func() {
output, code, err := c.exec.Exec(timeout, cmd, args)
select {
case resCh <- execResult{output, code, err}:
case <-ctx.Done():
}
}()
select {
case res := <-resCh:
return res.buf, res.code, res.err
case <-ctx.Done():
return nil, 0, ctx.Err()
}
}
// scriptHandle is returned by scriptCheck.run by cancelling a scriptCheck and // scriptHandle is returned by scriptCheck.run by cancelling a scriptCheck and
// waiting for it to shutdown. // waiting for it to shutdown.
type scriptHandle struct { type scriptHandle struct {
@ -74,6 +122,11 @@ func newScriptCheck(allocID, taskName, checkID string, check *structs.ServiceChe
func (s *scriptCheck) run() *scriptHandle { func (s *scriptCheck) run() *scriptHandle {
ctx, cancel := context.WithCancel(context.Background()) ctx, cancel := context.WithCancel(context.Background())
exitCh := make(chan struct{}) exitCh := make(chan struct{})
// Wrap the original ScriptExecutor in one that obeys context
// cancelation.
ctxExec := newContextExec(ctx, s.exec)
go func() { go func() {
defer close(exitCh) defer close(exitCh)
timer := time.NewTimer(0) timer := time.NewTimer(0)
@ -93,7 +146,7 @@ func (s *scriptCheck) run() *scriptHandle {
metrics.IncrCounter([]string{"client", "consul", "script_runs"}, 1) metrics.IncrCounter([]string{"client", "consul", "script_runs"}, 1)
// Execute check script with timeout // Execute check script with timeout
output, code, err := s.exec.Exec(s.check.Timeout, s.check.Command, s.check.Args) output, code, err := ctxExec.Exec(s.check.Timeout, s.check.Command, s.check.Args)
switch err { switch err {
case context.Canceled: case context.Canceled:
// check removed during execution; exit // check removed during execution; exit

View file

@ -5,6 +5,7 @@ import (
"fmt" "fmt"
"os" "os"
"os/exec" "os/exec"
"sync/atomic"
"testing" "testing"
"time" "time"
@ -23,20 +24,34 @@ func TestMain(m *testing.M) {
// blockingScriptExec implements ScriptExec by running a subcommand that never // blockingScriptExec implements ScriptExec by running a subcommand that never
// exits. // exits.
type blockingScriptExec struct { type blockingScriptExec struct {
// pctx is canceled *only* for test cleanup. Just like real
// ScriptExecutors its Exec method cannot be canceled directly -- only
// with a timeout.
pctx context.Context
// running is ticked before blocking to allow synchronizing operations // running is ticked before blocking to allow synchronizing operations
running chan struct{} running chan struct{}
// set to true if Exec is called and has exited // set to 1 with atomics if Exec is called and has exited
exited bool exited int32
} }
func newBlockingScriptExec() *blockingScriptExec { // newBlockingScriptExec returns a ScriptExecutor that blocks Exec() until the
return &blockingScriptExec{running: make(chan struct{})} // caller recvs on the b.running chan. It also returns a CancelFunc for test
// cleanup only. The runtime cannot cancel ScriptExecutors before their timeout
// expires.
func newBlockingScriptExec() (*blockingScriptExec, context.CancelFunc) {
ctx, cancel := context.WithCancel(context.Background())
exec := &blockingScriptExec{
pctx: ctx,
running: make(chan struct{}),
}
return exec, cancel
} }
func (b *blockingScriptExec) Exec(dur time.Duration, _ string, _ []string) ([]byte, int, error) { func (b *blockingScriptExec) Exec(dur time.Duration, _ string, _ []string) ([]byte, int, error) {
b.running <- struct{}{} b.running <- struct{}{}
ctx, cancel := context.WithTimeout(context.Background(), dur) ctx, cancel := context.WithTimeout(b.pctx, dur)
defer cancel() defer cancel()
cmd := exec.CommandContext(ctx, testtask.Path(), "sleep", "9000h") cmd := exec.CommandContext(ctx, testtask.Path(), "sleep", "9000h")
testtask.SetCmdEnv(cmd) testtask.SetCmdEnv(cmd)
@ -47,23 +62,20 @@ func (b *blockingScriptExec) Exec(dur time.Duration, _ string, _ []string) ([]by
code = 1 code = 1
} }
} }
b.exited = true atomic.StoreInt32(&b.exited, 1)
return []byte{}, code, err return []byte{}, code, err
} }
// TestConsulScript_Exec_Cancel asserts cancelling a script check shortcircuits // TestConsulScript_Exec_Cancel asserts cancelling a script check shortcircuits
// any running scripts. // any running scripts.
func TestConsulScript_Exec_Cancel(t *testing.T) { func TestConsulScript_Exec_Cancel(t *testing.T) {
// FIXME: This test is failing now as check process cancellation
// doesn't get propogated to the script check causing timeouts
t.Skip("FIXME: unexpected failing test")
serviceCheck := structs.ServiceCheck{ serviceCheck := structs.ServiceCheck{
Name: "sleeper", Name: "sleeper",
Interval: time.Hour, Interval: time.Hour,
Timeout: time.Hour, Timeout: time.Hour,
} }
exec := newBlockingScriptExec() exec, cancel := newBlockingScriptExec()
defer cancel()
// pass nil for heartbeater as it shouldn't be called // pass nil for heartbeater as it shouldn't be called
check := newScriptCheck("allocid", "testtask", "checkid", &serviceCheck, exec, nil, testlog.HCLogger(t), nil) check := newScriptCheck("allocid", "testtask", "checkid", &serviceCheck, exec, nil, testlog.HCLogger(t), nil)
@ -80,8 +92,11 @@ func TestConsulScript_Exec_Cancel(t *testing.T) {
case <-time.After(3 * time.Second): case <-time.After(3 * time.Second):
t.Fatalf("timed out waiting for script check to exit") t.Fatalf("timed out waiting for script check to exit")
} }
if !exec.exited {
t.Errorf("expected script executor to run and exit but it has not") // The underlying ScriptExecutor (newBlockScriptExec) *cannot* be
// canceled. Only a wrapper around it obeys the context cancelation.
if atomic.LoadInt32(&exec.exited) == 1 {
t.Errorf("expected script executor to still be running after timeout")
} }
} }
@ -106,16 +121,19 @@ func newFakeHeartbeater() *fakeHeartbeater {
return &fakeHeartbeater{updates: make(chan execStatus)} return &fakeHeartbeater{updates: make(chan execStatus)}
} }
// TestConsulScript_Exec_Timeout asserts a script will be killed when the // TestConsulScript_Exec_TimeoutBasic asserts a script will be killed when the
// timeout is reached. // timeout is reached.
func TestConsulScript_Exec_Timeout(t *testing.T) { func TestConsulScript_Exec_TimeoutBasic(t *testing.T) {
t.Parallel() // run the slow tests in parallel t.Parallel()
serviceCheck := structs.ServiceCheck{ serviceCheck := structs.ServiceCheck{
Name: "sleeper", Name: "sleeper",
Interval: time.Hour, Interval: time.Hour,
Timeout: time.Second, Timeout: time.Second,
} }
exec := newBlockingScriptExec()
exec, cancel := newBlockingScriptExec()
defer cancel()
hb := newFakeHeartbeater() hb := newFakeHeartbeater()
check := newScriptCheck("allocid", "testtask", "checkid", &serviceCheck, exec, hb, testlog.HCLogger(t), nil) check := newScriptCheck("allocid", "testtask", "checkid", &serviceCheck, exec, hb, testlog.HCLogger(t), nil)
@ -132,8 +150,11 @@ func TestConsulScript_Exec_Timeout(t *testing.T) {
case <-time.After(3 * time.Second): case <-time.After(3 * time.Second):
t.Fatalf("timed out waiting for script check to exit") t.Fatalf("timed out waiting for script check to exit")
} }
if !exec.exited {
t.Errorf("expected script executor to run and exit but it has not") // The underlying ScriptExecutor (newBlockScriptExec) *cannot* be
// canceled. Only a wrapper around it obeys the context cancelation.
if atomic.LoadInt32(&exec.exited) == 1 {
t.Errorf("expected script executor to still be running after timeout")
} }
// Cancel and watch for exit // Cancel and watch for exit
@ -160,11 +181,8 @@ func (sleeperExec) Exec(time.Duration, string, []string) ([]byte, int, error) {
// the timeout is reached and always set a critical status regardless of what // the timeout is reached and always set a critical status regardless of what
// Exec returns. // Exec returns.
func TestConsulScript_Exec_TimeoutCritical(t *testing.T) { func TestConsulScript_Exec_TimeoutCritical(t *testing.T) {
// FIXME: This test is failing now because we no longer mark critical t.Parallel()
// if check succeeded
t.Skip("FIXME: unexpected failing test")
t.Parallel() // run the slow tests in parallel
serviceCheck := structs.ServiceCheck{ serviceCheck := structs.ServiceCheck{
Name: "sleeper", Name: "sleeper",
Interval: time.Hour, Interval: time.Hour,