driver/docker: allow configurable pull context timeout.

Pulling large Docker images can take longer than the default
context timeout. Without a way to change this, it is very hard for
users to utilise Nomad properly without hacky workarounds.

This change adds an optional image_pull_timeout task config
parameter which gives operators the ability to account for
increased pull times where needed. The infra image gains a matching
infra_image_pull_timeout plugin option for consistency.
James Rasell 2020-08-12 08:58:07 +01:00
parent a5dc6df0ff
commit bc42cd2e5e
10 changed files with 199 additions and 119 deletions
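
For illustration, a jobspec task using the new option might look like the
following sketch. Only image_pull_timeout is introduced by this commit; the
surrounding task stanza is standard Nomad job configuration, with the image
and value borrowed from the tests below.

task "redis" {
  driver = "docker"

  config {
    image              = "redis:3.2"
    image_pull_timeout = "15m"
  }
}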

@@ -259,6 +259,11 @@ var (
hclspec.NewAttr("infra_image", "string", false),
hclspec.NewLiteral(`"gcr.io/google_containers/pause-amd64:3.0"`),
),
// timeout to use when pulling the infra image.
"infra_image_pull_timeout": hclspec.NewDefault(
hclspec.NewAttr("infra_image_pull_timeout", "string", false),
hclspec.NewLiteral(`"5m"`),
),
// the duration that the driver will wait for activity from the Docker engine during an image pull
// before canceling the request
@@ -266,7 +271,6 @@ var (
hclspec.NewAttr("pull_activity_timeout", "string", false),
hclspec.NewLiteral(`"2m"`),
),
// disable_log_collection indicates whether docker driver should collect logs of docker
// task containers. If true, nomad doesn't start docker_logger/logmon processes
"disable_log_collection": hclspec.NewAttr("disable_log_collection", "bool", false),
@@ -349,6 +353,10 @@ var (
"pid_mode": hclspec.NewAttr("pid_mode", "string", false),
"port_map": hclspec.NewAttr("port_map", "list(map(number))", false),
"privileged": hclspec.NewAttr("privileged", "bool", false),
"image_pull_timeout": hclspec.NewDefault(
hclspec.NewAttr("image_pull_timeout", "string", false),
hclspec.NewLiteral(`"5m"`),
),
"readonly_rootfs": hclspec.NewAttr("readonly_rootfs", "bool", false),
"security_opt": hclspec.NewAttr("security_opt", "list(string)", false),
"shm_size": hclspec.NewAttr("shm_size", "number", false),
@@ -415,6 +423,7 @@ type TaskConfig struct {
PidMode string `codec:"pid_mode"`
PortMap hclutils.MapStrInt `codec:"port_map"`
Privileged bool `codec:"privileged"`
ImagePullTimeout string `codec:"image_pull_timeout"`
ReadonlyRootfs bool `codec:"readonly_rootfs"`
SecurityOpt []string `codec:"security_opt"`
ShmSize int64 `codec:"shm_size"`
@@ -576,6 +585,8 @@ type DriverConfig struct {
AllowCaps []string `codec:"allow_caps"`
GPURuntimeName string `codec:"nvidia_runtime"`
InfraImage string `codec:"infra_image"`
InfraImagePullTimeout string `codec:"infra_image_pull_timeout"`
infraImagePullTimeoutDuration time.Duration `codec:"-"`
DisableLogCollection bool `codec:"disable_log_collection"`
PullActivityTimeout string `codec:"pull_activity_timeout"`
pullActivityTimeoutDuration time.Duration `codec:"-"`
@@ -667,6 +678,14 @@ func (d *Driver) SetConfig(c *base.Config) error {
d.config.pullActivityTimeoutDuration = dur
}
if d.config.InfraImagePullTimeout != "" {
dur, err := time.ParseDuration(d.config.InfraImagePullTimeout)
if err != nil {
return fmt.Errorf("failed to parse 'infra_image_pull_timeout' duaration: %v", err)
}
d.config.infraImagePullTimeoutDuration = dur
}
d.config.allowRuntimes = make(map[string]struct{}, len(d.config.AllowRuntimesList))
for _, r := range d.config.AllowRuntimesList {
d.config.allowRuntimes[r] = struct{}{}
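
As an aside, the timeout strings above are validated with Go's standard
time.ParseDuration, so any Go duration literal is accepted. A minimal,
self-contained sketch of the behaviour SetConfig relies on (illustrative
only, not part of this commit):

package main

import (
	"fmt"
	"time"
)

func main() {
	// valid duration literals, as accepted by pull_activity_timeout,
	// image_pull_timeout and infra_image_pull_timeout
	for _, s := range []string{"90s", "5m", "1h30m"} {
		d, err := time.ParseDuration(s)
		fmt.Println(s, d, err) // e.g. "90s 1m30s <nil>"
	}

	// an unknown unit is rejected; SetConfig surfaces this as a config error
	if _, err := time.ParseDuration("5minutes"); err != nil {
		fmt.Println("rejected:", err)
	}
}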

@@ -25,6 +25,7 @@ func TestConfig_ParseHCL(t *testing.T) {
Devices: []DockerDevice{},
Mounts: []DockerMount{},
CPUCFSPeriod: 100000,
ImagePullTimeout: "5m",
},
},
}
@@ -57,6 +58,7 @@ func TestConfig_ParseJSON(t *testing.T) {
Mounts: []DockerMount{},
Devices: []DockerDevice{},
CPUCFSPeriod: 100000,
ImagePullTimeout: "5m",
},
},
{
@@ -67,6 +69,7 @@ func TestConfig_ParseJSON(t *testing.T) {
Mounts: []DockerMount{},
Devices: []DockerDevice{},
CPUCFSPeriod: 100000,
ImagePullTimeout: "5m",
},
},
{
@@ -77,6 +80,7 @@ func TestConfig_ParseJSON(t *testing.T) {
Mounts: []DockerMount{},
Devices: []DockerDevice{},
CPUCFSPeriod: 100000,
ImagePullTimeout: "5m",
},
},
{
@@ -87,6 +91,7 @@ func TestConfig_ParseJSON(t *testing.T) {
Mounts: []DockerMount{},
Devices: []DockerDevice{},
CPUCFSPeriod: 100000,
ImagePullTimeout: "5m",
},
},
}
@@ -178,6 +183,7 @@ func TestConfig_ParseAllHCL(t *testing.T) {
cfgStr := `
config {
image = "redis:3.2"
image_pull_timeout = "15m"
advertise_ipv6_address = true
args = ["command_arg1", "command_arg2"]
auth {
@@ -301,6 +307,7 @@ config {
expected := &TaskConfig{
Image: "redis:3.2",
ImagePullTimeout: "15m",
AdvertiseIPv6Addr: true,
Args: []string{"command_arg1", "command_arg2"},
Auth: DockerAuth{
@@ -528,6 +535,33 @@ func TestConfig_InternalCapabilities(t *testing.T) {
}
}
func TestConfig_DriverConfig_InfraImagePullTimeout(t *testing.T) {
cases := []struct {
name string
config string
expected string
}{
{
name: "default",
config: `{}`,
expected: "5m",
},
{
name: "set explicitly",
config: `{ infra_image_pull_timeout = "1m" }`,
expected: "1m",
},
}
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
var tc DriverConfig
hclutils.NewConfigParser(configSpec).ParseHCL(t, "config "+c.config, &tc)
require.Equal(t, c.expected, tc.InfraImagePullTimeout)
})
}
}
func TestConfig_DriverConfig_PullActivityTimeout(t *testing.T) {
cases := []struct {
name string

@@ -131,7 +131,8 @@ func newDockerCoordinator(config *dockerCoordinatorConfig) *dockerCoordinator {
// PullImage is used to pull an image. It returns the pulled image ID or an
// error that occurred during the pull
func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConfiguration, callerID string, emitFn LogEventFn, pullActivityTimeout time.Duration) (imageID string, err error) {
func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConfiguration, callerID string,
emitFn LogEventFn, pullTimeout, pullActivityTimeout time.Duration) (imageID string, err error) {
// Get the future
d.imageLock.Lock()
future, ok := d.pullFutures[image]
@@ -140,7 +141,7 @@ func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConf
// Make the future
future = newPullFuture()
d.pullFutures[image] = future
go d.pullImageImpl(image, authOptions, pullActivityTimeout, future)
go d.pullImageImpl(image, authOptions, pullTimeout, pullActivityTimeout, future)
}
d.imageLock.Unlock()
@@ -167,11 +168,13 @@ func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConf
// pullImageImpl is the implementation of pulling an image. The results are
// returned via the passed future
func (d *dockerCoordinator) pullImageImpl(image string, authOptions *docker.AuthConfiguration, pullActivityTimeout time.Duration, future *pullFuture) {
func (d *dockerCoordinator) pullImageImpl(image string, authOptions *docker.AuthConfiguration,
pullTimeout, pullActivityTimeout time.Duration, future *pullFuture) {
defer d.clearPullLogger(image)
// Parse the repo and tag
repo, tag := parseDockerImage(image)
ctx, cancel := context.WithCancel(context.Background())
ctx, cancel := context.WithTimeout(context.Background(), pullTimeout)
defer cancel()
pm := newImageProgressManager(image, cancel, pullActivityTimeout, d.handlePullInactivity,
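
The switch from context.WithCancel to context.WithTimeout is what enforces
the overall deadline: the pull context is now cancelled automatically once
pullTimeout elapses, in addition to the existing inactivity-based
cancellation driven by the progress manager. A standalone sketch of that
behaviour, with a slow operation standing in for the Docker pull (nothing
below is from this commit):

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// slowPull stands in for a long-running image pull that honours its context.
func slowPull(ctx context.Context) error {
	select {
	case <-time.After(10 * time.Second): // pretend the pull takes 10s
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

func main() {
	// unlike WithCancel, a WithTimeout context expires on its own
	ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer cancel()

	err := slowPull(ctx)
	fmt.Println(errors.Is(err, context.DeadlineExceeded)) // true
}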

@@ -73,10 +73,10 @@ func TestDockerCoordinator_ConcurrentPulls(t *testing.T) {
// Create a coordinator
coordinator := newDockerCoordinator(config)
id, _ := coordinator.PullImage(image, nil, uuid.Generate(), nil, 2*time.Minute)
id, _ := coordinator.PullImage(image, nil, uuid.Generate(), nil, 5*time.Minute, 2*time.Minute)
for i := 0; i < 9; i++ {
go func() {
coordinator.PullImage(image, nil, uuid.Generate(), nil, 2*time.Minute)
coordinator.PullImage(image, nil, uuid.Generate(), nil, 5*time.Minute, 2*time.Minute)
}()
}
@@ -129,7 +129,7 @@ func TestDockerCoordinator_Pull_Remove(t *testing.T) {
callerIDs := make([]string, 10, 10)
for i := 0; i < 10; i++ {
callerIDs[i] = uuid.Generate()
id, _ = coordinator.PullImage(image, nil, callerIDs[i], nil, 2*time.Minute)
id, _ = coordinator.PullImage(image, nil, callerIDs[i], nil, 5*time.Minute, 2*time.Minute)
}
// Check the reference count
@@ -195,7 +195,7 @@ func TestDockerCoordinator_Remove_Cancel(t *testing.T) {
callerID := uuid.Generate()
// Pull image
id, _ := coordinator.PullImage(image, nil, callerID, nil, 2*time.Minute)
id, _ := coordinator.PullImage(image, nil, callerID, nil, 5*time.Minute, 2*time.Minute)
// Check the reference count
if references := coordinator.imageRefCount[id]; len(references) != 1 {
@@ -211,7 +211,7 @@ func TestDockerCoordinator_Remove_Cancel(t *testing.T) {
}
// Pull image again within delay
id, _ = coordinator.PullImage(image, nil, callerID, nil, 2*time.Minute)
id, _ = coordinator.PullImage(image, nil, callerID, nil, 5*time.Minute, 2*time.Minute)
// Check the reference count
if references := coordinator.imageRefCount[id]; len(references) != 1 {
@@ -244,7 +244,7 @@ func TestDockerCoordinator_No_Cleanup(t *testing.T) {
callerID := uuid.Generate()
// Pull image
id, _ := coordinator.PullImage(image, nil, callerID, nil, 2*time.Minute)
id, _ := coordinator.PullImage(image, nil, callerID, nil, 5*time.Minute, 2*time.Minute)
// Check the reference count
if references := coordinator.imageRefCount[id]; len(references) != 0 {
@@ -283,10 +283,10 @@ func TestDockerCoordinator_Cleanup_HonorsCtx(t *testing.T) {
callerID := uuid.Generate()
// Pull image
id1, _ := coordinator.PullImage(image1ID, nil, callerID, nil, 2*time.Minute)
id1, _ := coordinator.PullImage(image1ID, nil, callerID, nil, 5*time.Minute, 2*time.Minute)
require.Len(t, coordinator.imageRefCount[id1], 1, "image reference count")
id2, _ := coordinator.PullImage(image2ID, nil, callerID, nil, 2*time.Minute)
id2, _ := coordinator.PullImage(image2ID, nil, callerID, nil, 5*time.Minute, 2*time.Minute)
require.Len(t, coordinator.imageRefCount[id2], 1, "image reference count")
// remove one image, cancel ctx, remove second, and assert only the first image is cleaned up

@@ -561,7 +561,12 @@ func (d *Driver) pullImage(task *drivers.TaskConfig, driverConfig *TaskConfig, c
},
})
return d.coordinator.PullImage(driverConfig.Image, authOptions, task.ID, d.emitEventFunc(task), d.config.pullActivityTimeoutDuration)
pullDur, err := time.ParseDuration(driverConfig.ImagePullTimeout)
if err != nil {
return "", fmt.Errorf("Failed to parse image_pull_timeout: %v", err)
}
return d.coordinator.PullImage(driverConfig.Image, authOptions, task.ID, d.emitEventFunc(task), pullDur, d.config.pullActivityTimeoutDuration)
}
func (d *Driver) emitEventFunc(task *drivers.TaskConfig) LogEventFn {

@@ -453,6 +453,7 @@ func TestDockerDriver_Start_BadPull_Recoverable(t *testing.T) {
taskCfg := TaskConfig{
Image: "127.0.0.1:32121/foo", // bad path
ImagePullTimeout: "5m",
Command: "echo",
Args: []string{
"hello",

@@ -715,6 +715,7 @@ func TestDockerDriver_Start_Image_HTTPS(t *testing.T) {
taskCfg := TaskConfig{
Image: "https://gcr.io/google_containers/pause:0.8.0",
ImagePullTimeout: "5m",
}
task := &drivers.TaskConfig{
ID: uuid.Generate(),
@@ -747,6 +748,7 @@ func newTaskConfig(variant string, command []string) TaskConfig {
return TaskConfig{
Image: image,
ImagePullTimeout: "5m",
LoadImage: loadImage,
Command: command[0],
Args: command[1:],

@@ -15,6 +15,7 @@ func newTaskConfig(variant string, command []string) TaskConfig {
return TaskConfig{
Image: busyboxImageID,
ImagePullTimeout: "5m",
Command: command[0],
Args: command[1:],
}

@@ -27,7 +27,7 @@ func (d *Driver) CreateNetwork(allocID string) (*drivers.NetworkIsolationSpec, b
if err != nil {
d.logger.Debug("auth failed for infra container image pull", "image", d.config.InfraImage, "error", err)
}
_, err = d.coordinator.PullImage(d.config.InfraImage, authOptions, allocID, noopLogEventFn, d.config.pullActivityTimeoutDuration)
_, err = d.coordinator.PullImage(d.config.InfraImage, authOptions, allocID, noopLogEventFn, d.config.infraImagePullTimeoutDuration, d.config.pullActivityTimeoutDuration)
if err != nil {
return nil, false, err
}
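
Operators who need a longer window for the infra image pull would set the
matching option in the client's plugin block. A hedged sketch (only
infra_image_pull_timeout is introduced by this commit; the plugin block
syntax is standard Nomad agent configuration and the "10m" value is
arbitrary):

plugin "docker" {
  config {
    infra_image_pull_timeout = "10m"
  }
}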

@@ -46,6 +46,7 @@ func TestParseHclInterface_Hcl(t *testing.T) {
Devices: []docker.DockerDevice{},
Mounts: []docker.DockerMount{},
CPUCFSPeriod: 100000,
ImagePullTimeout: "5m",
},
expectedType: &docker.TaskConfig{},
},
@@ -63,6 +64,7 @@ func TestParseHclInterface_Hcl(t *testing.T) {
Devices: []docker.DockerDevice{},
Mounts: []docker.DockerMount{},
CPUCFSPeriod: 100000,
ImagePullTimeout: "5m",
},
expectedType: &docker.TaskConfig{},
},
@@ -80,6 +82,7 @@ func TestParseHclInterface_Hcl(t *testing.T) {
Devices: []docker.DockerDevice{},
Mounts: []docker.DockerMount{},
CPUCFSPeriod: 100000,
ImagePullTimeout: "5m",
},
expectedType: &docker.TaskConfig{},
},
@@ -99,6 +102,7 @@ func TestParseHclInterface_Hcl(t *testing.T) {
Devices: []docker.DockerDevice{},
Mounts: []docker.DockerMount{},
CPUCFSPeriod: 100000,
ImagePullTimeout: "5m",
},
expectedType: &docker.TaskConfig{},
},
@@ -116,6 +120,7 @@ func TestParseHclInterface_Hcl(t *testing.T) {
Devices: []docker.DockerDevice{},
Mounts: []docker.DockerMount{},
CPUCFSPeriod: 100000,
ImagePullTimeout: "5m",
},
expectedType: &docker.TaskConfig{},
},
@@ -135,6 +140,7 @@ func TestParseHclInterface_Hcl(t *testing.T) {
Devices: []docker.DockerDevice{},
Mounts: []docker.DockerMount{},
CPUCFSPeriod: 100000,
ImagePullTimeout: "5m",
},
expectedType: &docker.TaskConfig{},
},
@@ -152,6 +158,7 @@ func TestParseHclInterface_Hcl(t *testing.T) {
Devices: []docker.DockerDevice{},
Mounts: []docker.DockerMount{},
CPUCFSPeriod: 100000,
ImagePullTimeout: "5m",
},
expectedType: &docker.TaskConfig{},
},
@@ -171,6 +178,7 @@ func TestParseHclInterface_Hcl(t *testing.T) {
Devices: []docker.DockerDevice{},
Mounts: []docker.DockerMount{},
CPUCFSPeriod: 100000,
ImagePullTimeout: "5m",
},
expectedType: &docker.TaskConfig{},
},
@@ -191,6 +199,7 @@ func TestParseHclInterface_Hcl(t *testing.T) {
Devices: []docker.DockerDevice{},
Mounts: []docker.DockerMount{},
CPUCFSPeriod: 100000,
ImagePullTimeout: "5m",
},
expectedType: &docker.TaskConfig{},
},
@@ -210,6 +219,7 @@ func TestParseHclInterface_Hcl(t *testing.T) {
Devices: []docker.DockerDevice{},
Mounts: []docker.DockerMount{},
CPUCFSPeriod: 100000,
ImagePullTimeout: "5m",
},
expectedType: &docker.TaskConfig{},
},
@@ -233,6 +243,7 @@ func TestParseHclInterface_Hcl(t *testing.T) {
Devices: []docker.DockerDevice{},
Mounts: []docker.DockerMount{},
CPUCFSPeriod: 100000,
ImagePullTimeout: "5m",
},
expectedType: &docker.TaskConfig{},
},
@@ -258,6 +269,7 @@ func TestParseHclInterface_Hcl(t *testing.T) {
Devices: []docker.DockerDevice{},
Mounts: []docker.DockerMount{},
CPUCFSPeriod: 100000,
ImagePullTimeout: "5m",
},
expectedType: &docker.TaskConfig{},
},
@@ -294,6 +306,7 @@ func TestParseHclInterface_Hcl(t *testing.T) {
},
Mounts: []docker.DockerMount{},
CPUCFSPeriod: 100000,
ImagePullTimeout: "5m",
},
expectedType: &docker.TaskConfig{},
},
@@ -325,6 +338,7 @@ func TestParseHclInterface_Hcl(t *testing.T) {
Devices: []docker.DockerDevice{},
Mounts: []docker.DockerMount{},
CPUCFSPeriod: 100000,
ImagePullTimeout: "5m",
},
expectedType: &docker.TaskConfig{},
},
@@ -363,6 +377,7 @@ func TestParseHclInterface_Hcl(t *testing.T) {
},
Mounts: []docker.DockerMount{},
CPUCFSPeriod: 100000,
ImagePullTimeout: "5m",
},
expectedType: &docker.TaskConfig{},
},