connect: use exp backoff when waiting on consul envoy bootstrap

This PR wraps the use of the consul envoy bootstrap command in
an expoenential backoff closure, configured to timeout after 60
seconds. This is an increase over the current behavior of making
3 attempts over 6 seconds.

Should help with #10451
This commit is contained in:
Seth Hoenig 2021-04-26 13:24:06 -06:00
parent 2a66922daf
commit 238ac718f2
5 changed files with 336 additions and 46 deletions

View File

@ -17,6 +17,7 @@ IMPROVEMENTS:
* client/config: Enable sockaddr templating for `network-interface` attribute. [[GH-10404](https://github.com/hashicorp/nomad/issues/10404)]
* client/fingerprint: Added support multiple host network aliases for the same interface. [[GH-10104](https://github.com/hashicorp/nomad/issues/10104)]
* consul: Allow setting `body` field on service/check Consul health checks. [[GH-10186](https://github.com/hashicorp/nomad/issues/10186)]
* consul/connect: Use exponential backoff for consul envoy bootstrap process [[GH-10453](https://github.com/hashicorp/nomad/pull/10453)]
* consul/connect: Enable setting `local_bind_address` field on connect upstreams [[GH-6248](https://github.com/hashicorp/nomad/issues/6248)]
* consul/connect: Automatically populate `CONSUL_HTTP_ADDR` for connect native tasks in host networking mode. [[GH-10239](https://github.com/hashicorp/nomad/issues/10239)]
* csi: Added support for jobs to request a unique volume ID per allocation. [[GH-10136](https://github.com/hashicorp/nomad/issues/10136)]

View File

@ -16,6 +16,7 @@ import (
"github.com/hashicorp/nomad/client/taskenv"
agentconsul "github.com/hashicorp/nomad/command/agent/consul"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/helper/exptime"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/nomad/structs/config"
"github.com/pkg/errors"
@ -23,6 +24,21 @@ import (
const envoyBootstrapHookName = "envoy_bootstrap"
const (
// envoyBootstrapWaitTime is the amount of time this hook should wait on Consul
// objects to be created before giving up.
envoyBootstrapWaitTime = 60 * time.Second
// envoyBootstrapInitialGap is the initial amount of time the envoy bootstrap
// retry loop will wait, exponentially increasing each iteration, not including
// jitter.
envoyBoostrapInitialGap = 1 * time.Second
// envoyBootstrapMaxJitter is the maximum amount of jitter applied to the
// wait gap each iteration of the envoy bootstrap retry loop.
envoyBootstrapMaxJitter = 500 * time.Millisecond
)
type consulTransportConfig struct {
HTTPAddr string // required
Auth string // optional, env CONSUL_HTTP_AUTH
@ -100,16 +116,32 @@ type envoyBootstrapHook struct {
// consulNamespace is the Consul namespace as set by in the job
consulNamespace string
// envoyBootstrapWaitTime is the total amount of time hook will wait for Consul
envoyBootstrapWaitTime time.Duration
// envoyBootstrapInitialGap is the initial wait gap when retyring
envoyBoostrapInitialGap time.Duration
// envoyBootstrapMaxJitter is the maximum amount of jitter applied to retries
envoyBootstrapMaxJitter time.Duration
// envoyBootstrapExpSleep controls exponential waiting
envoyBootstrapExpSleep func(time.Duration)
// logger is used to log things
logger hclog.Logger
}
func newEnvoyBootstrapHook(c *envoyBootstrapHookConfig) *envoyBootstrapHook {
return &envoyBootstrapHook{
alloc: c.alloc,
consulConfig: c.consul,
consulNamespace: c.consulNamespace,
logger: c.logger.Named(envoyBootstrapHookName),
alloc: c.alloc,
consulConfig: c.consul,
consulNamespace: c.consulNamespace,
envoyBootstrapWaitTime: envoyBootstrapWaitTime,
envoyBoostrapInitialGap: envoyBoostrapInitialGap,
envoyBootstrapMaxJitter: envoyBootstrapMaxJitter,
envoyBootstrapExpSleep: time.Sleep,
logger: c.logger.Named(envoyBootstrapHookName),
}
}
@ -221,66 +253,71 @@ func (h *envoyBootstrapHook) Prestart(ctx context.Context, req *ifs.TaskPrestart
bootstrapArgs := bootstrap.args()
bootstrapEnv := bootstrap.env(os.Environ())
// Since Consul services are registered asynchronously with this task
// hook running, retry a small number of times with backoff.
for tries := 3; ; tries-- {
// keep track of latest error returned from exec-ing consul envoy bootstrap
var cmdErr error
// Since Consul services are registered asynchronously with this task
// hook running, retry until timeout or success.
if backoffErr := exptime.Backoff(func() (bool, error) {
// If hook is killed, just stop.
select {
case <-ctx.Done():
return false, nil
default:
}
// Prepare bootstrap command to run.
cmd := exec.CommandContext(ctx, "consul", bootstrapArgs...)
cmd.Env = bootstrapEnv
// Redirect output to secrets/envoy_bootstrap.json
fd, err := os.Create(bootstrapFilePath)
if err != nil {
return fmt.Errorf("error creating secrets/envoy_bootstrap.json for envoy: %v", err)
// Redirect stdout to secrets/envoy_bootstrap.json.
fd, fileErr := os.Create(bootstrapFilePath)
if fileErr != nil {
return false, fmt.Errorf("failed to create secrets/envoy_bootstrap.json for envoy: %w", fileErr)
}
cmd.Stdout = fd
// Redirect stderr into a buffer for later reading.
buf := bytes.NewBuffer(nil)
cmd.Stderr = buf
// Generate bootstrap
err = cmd.Run()
cmdErr = cmd.Run()
// Close bootstrap.json
fd.Close()
// Close bootstrap.json regardless of any command errors.
_ = fd.Close()
if err == nil {
// Happy path! Bootstrap was created, exit.
break
// Command succeeded, exit.
if cmdErr == nil {
// Bootstrap written. Mark as done and move on.
resp.Done = true
return false, nil
}
// Check for error from command
if tries == 0 {
h.logger.Error("error creating bootstrap configuration for Connect proxy sidecar", "error", err, "stderr", buf.String())
// Command failed, prepare for retry
//
// Cleanup the bootstrap file. An errors here is not
// important as (a) we test to ensure the deletion
// occurs, and (b) the file will either be rewritten on
// retry or eventually garbage collected if the task
// fails.
_ = os.Remove(bootstrapFilePath)
// Cleanup the bootstrap file. An errors here is not
// important as (a) we test to ensure the deletion
// occurs, and (b) the file will either be rewritten on
// retry or eventually garbage collected if the task
// fails.
os.Remove(bootstrapFilePath)
// ExitErrors are recoverable since they indicate the
// command was runnable but exited with a unsuccessful
// error code.
_, recoverable := err.(*exec.ExitError)
return structs.NewRecoverableError(
fmt.Errorf("error creating bootstrap configuration for Connect proxy sidecar: %v", err),
recoverable,
)
}
// Sleep before retrying to give Consul services time to register
select {
case <-time.After(2 * time.Second):
case <-ctx.Done():
// Killed before bootstrap, exit without setting Done
return nil
}
return true, cmdErr
}, exptime.BackoffOptions{
MaxSleepTime: h.envoyBootstrapWaitTime,
InitialGapSize: h.envoyBoostrapInitialGap,
MaxJitterSize: h.envoyBootstrapMaxJitter,
}); backoffErr != nil {
// Wrap the last error from Consul and set that as our status.
_, recoverable := cmdErr.(*exec.ExitError)
return structs.NewRecoverableError(
fmt.Errorf("error creating bootstrap configuration for Connect proxy sidecar: %v", cmdErr),
recoverable,
)
}
// Bootstrap written. Mark as done and move on.
resp.Done = true
return nil
}

View File

@ -12,6 +12,7 @@ import (
"os"
"path/filepath"
"testing"
"time"
consulapi "github.com/hashicorp/consul/api"
"github.com/hashicorp/nomad/client/allocdir"
@ -625,6 +626,11 @@ func TestTaskRunner_EnvoyBootstrapHook_RecoverableError(t *testing.T) {
h := newEnvoyBootstrapHook(newEnvoyBootstrapHookConfig(alloc, &config.ConsulConfig{
Addr: testConsul.HTTPAddr,
}, consulNamespace, logger))
// Lower the allowable wait time for testing
h.envoyBootstrapWaitTime = 1 * time.Second
h.envoyBoostrapInitialGap = 100 * time.Millisecond
req := &interfaces.TaskPrestartRequest{
Task: sidecarTask,
TaskDir: allocDir.NewTaskDir(sidecarTask.Name),
@ -648,6 +654,99 @@ func TestTaskRunner_EnvoyBootstrapHook_RecoverableError(t *testing.T) {
require.True(t, os.IsNotExist(err))
}
func TestTaskRunner_EnvoyBootstrapHook_retryTimeout(t *testing.T) {
t.Parallel()
logger := testlog.HCLogger(t)
testConsul := getTestConsul(t)
defer testConsul.Stop()
begin := time.Now()
// Setup an Allocation
alloc := mock.ConnectAlloc()
alloc.AllocatedResources.Shared.Networks = []*structs.NetworkResource{
{
Mode: "bridge",
IP: "10.0.0.1",
DynamicPorts: []structs.Port{
{
Label: "connect-proxy-foo",
Value: 9999,
To: 9999,
},
},
},
}
tg := alloc.Job.TaskGroups[0]
tg.Services = []*structs.Service{
{
Name: "foo",
PortLabel: "9999", // Just need a valid port, nothing will bind to it
Connect: &structs.ConsulConnect{
SidecarService: &structs.ConsulSidecarService{},
},
},
}
sidecarTask := &structs.Task{
Name: "sidecar",
Kind: structs.NewTaskKind(structs.ConnectProxyPrefix, "foo"),
}
tg.Tasks = append(tg.Tasks, sidecarTask)
allocDir, cleanupAlloc := allocdir.TestAllocDir(t, logger, "EnvoyBootstrapRetryTimeout")
defer cleanupAlloc()
// Get a Consul client
consulConfig := consulapi.DefaultConfig()
consulConfig.Address = testConsul.HTTPAddr
// Do NOT register group services, causing the hook to retry until timeout
// Run Connect bootstrap hook
h := newEnvoyBootstrapHook(newEnvoyBootstrapHookConfig(alloc, &config.ConsulConfig{
Addr: consulConfig.Address,
}, consulNamespace, logger))
// Keep track of the retry backoff iterations
iterations := 0
// Lower the allowable wait time for testing
h.envoyBootstrapWaitTime = 3 * time.Second
h.envoyBoostrapInitialGap = 1 * time.Second
h.envoyBootstrapExpSleep = func(d time.Duration) {
iterations++
time.Sleep(d)
}
// Create the prestart request
req := &interfaces.TaskPrestartRequest{
Task: sidecarTask,
TaskDir: allocDir.NewTaskDir(sidecarTask.Name),
TaskEnv: taskenv.NewEmptyTaskEnv(),
}
require.NoError(t, req.TaskDir.Build(false, nil))
var resp interfaces.TaskPrestartResponse
// Run the hook and get the error
err := h.Prestart(context.Background(), req, &resp)
require.EqualError(t, err, "error creating bootstrap configuration for Connect proxy sidecar: exit status 1")
// Current time should be at least start time + total wait time
minimum := begin.Add(h.envoyBootstrapWaitTime)
require.True(t, time.Now().After(minimum))
// Should hit at least 2 iterations
require.Greater(t, 2, iterations)
// Make sure we captured the recoverable-ness of the error
_, ok := err.(*structs.RecoverableError)
require.True(t, ok)
// Assert the hook is not done (it failed)
require.False(t, resp.Done)
}
func TestTaskRunner_EnvoyBootstrapHook_extractNameAndKind(t *testing.T) {
t.Run("connect sidecar", func(t *testing.T) {
kind, name, err := (*envoyBootstrapHook)(nil).extractNameAndKind(

11
helper/exptime/LICENSE.md Normal file
View File

@ -0,0 +1,11 @@
Copyright (c) 2019 The Indeed Engineering Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

142
helper/exptime/backoff.go Normal file
View File

@ -0,0 +1,142 @@
// Package exptime provides a generalized exponential backoff retry implementation.
//
// This package was copied from oss.indeed.com/go/libtime/decay and modified.
package exptime
import (
"errors"
"fmt"
"math/rand"
"time"
)
var (
// ErrMaximumTimeExceeded indicates the maximum wait time has been exceeded.
ErrMaximumTimeExceeded = errors.New("maximum backoff time exceeded")
)
// A TryFunc is what gets executed between retry wait periods during execution
// of Backoff. The keepRetrying return value is used to control whether a retry
// attempt should be made. This feature is useful in manipulating control flow
// in cases where it is known a retry will not be successful.
type TryFunc func() (keepRetrying bool, err error)
// BackoffOptions allow for fine-tuning backoff behavior.
type BackoffOptions struct {
// MaxSleepTime represents the maximum amount of time
// the exponential backoff system will spend sleeping,
// accumulating the amount of time spent asleep between
// retries.
//
// The algorithm starts at an interval of InitialGapSize
// and increases exponentially (x2 each iteration) from there.
// With no jitter, a MaxSleepTime of 10 seconds and InitialGapSize
// of 1 millisecond would suggest a total of 15 attempts
// (since the very last retry truncates the sleep time to
// align exactly with MaxSleepTime).
MaxSleepTime time.Duration
// InitialGapSize sets the initial amount of time the algorithm
// will sleep before the first retry (after the first attempt).
// The actual amount of sleep time will include a random amount
// of jitter, if MaxJitterSize is non-zero.
InitialGapSize time.Duration
// MaxJitterSize limits how much randomness we may
// introduce in the duration of each retry interval.
// The purpose of introducing jitter is to mitigate the
// effect of thundering herds
MaxJitterSize time.Duration
// RandomSeed is used for generating a randomly computed
// jitter size for each retry.
RandomSeed int64
// Sleeper is used to cause the process to sleep for
// a computed amount of time. If not set, a default
// implementation based on time.Sleep will be used.
Sleeper Sleeper
}
// A Sleeper is a useful way for calling time.Sleep
// in a mock-able way for tests.
type Sleeper func(time.Duration)
// Backoff will attempt to execute function using a configurable
// exponential backoff algorithm. function is a TryFunc which requires
// two return parameters - a boolean for optimizing control flow, and
// an error for reporting failure conditions. If the first parameter is
// false, the backoff algorithm will abandon further retry attempts and
// simply return an error. Otherwise, if the returned error is non-nil, the
// backoff algorithm will sleep for an increasing amount of time, and
// then retry again later, until the maximum amount of sleep time has
// been consumed. Once function has executed successfully with no error,
// the backoff algorithm returns a nil error.
func Backoff(function TryFunc, options BackoffOptions) error {
if options.MaxSleepTime <= 0 {
panic("max sleep time must be > 0")
}
if options.InitialGapSize <= 0 {
panic("initial gap size must be > 0")
}
if options.MaxJitterSize < 0 {
panic("max jitter size must be >= 0")
}
if options.MaxJitterSize > (options.MaxSleepTime / 2) {
panic("max jitter size is way too large")
}
if options.Sleeper == nil {
options.Sleeper = time.Sleep
}
consumed := time.Duration(0)
gap := options.InitialGapSize
random := rand.New(rand.NewSource(options.RandomSeed))
for consumed < options.MaxSleepTime {
keepRetrying, err := function()
if err != nil && !keepRetrying {
return fmt.Errorf("exponential backoff instructed to stop retrying: %w", err)
}
// we can ignore keepRetrying at this point, since we know
// what to do based on err
if err == nil {
return nil // success
}
// there was an error, and function wants to keep retrying
// we will sleep, and then let the loop continue
//
// (random.Float64 returns a value [0.0, 1.0), which is used to
// randomly scale the jitter from 0 to MaxJitterSize.
jitter := nextJitter(random.Float64(), options.MaxJitterSize)
duration := gap + jitter
if (duration + consumed) > options.MaxSleepTime {
// this will be our last try, force the duration
// to line up with the maximum sleep time
duration = options.MaxSleepTime - consumed
}
// sleep for the configured duration
options.Sleeper(duration)
// account for how long we intended to sleep
consumed += duration
// exponentially increase the gap
gap *= 2
}
return ErrMaximumTimeExceeded
}
func nextJitter(fraction float64, maxSize time.Duration) time.Duration {
scaled := fraction * float64(maxSize)
return time.Duration(scaled)
}