retry transient docker errors within function

This commit is contained in:
Mahmood Ali 2019-09-13 15:25:31 -04:00
parent e8d73e3d72
commit ac329a5e07
1 changed files with 26 additions and 2 deletions

View File

@ -41,6 +41,12 @@ var (
// running operations such as waiting on containers and collect stats
waitClient *docker.Client
dockerTransientErrs = []string{
"Client.Timeout exceeded while awaiting headers",
"EOF",
"API error (500)",
}
// recoverableErrTimeouts returns a recoverable error if the error was due
// to timeouts
recoverableErrTimeouts = func(err error) error {
@ -456,6 +462,10 @@ CREATE:
// There is still a very small chance this is possible even with the
// coordinator so retry.
return nil, nstructs.NewRecoverableError(createErr, true)
} else if isDockerTransientError(createErr) && attempted < 5 {
attempted++
time.Sleep(1 * time.Second)
goto CREATE
}
return nil, recoverableErrTimeouts(createErr)
@ -474,8 +484,7 @@ START:
d.logger.Debug("failed to start container", "container_id", c.ID, "attempt", attempted+1, "error", startErr)
// If it is a 500 error it is likely we can retry and be successful
if strings.Contains(startErr.Error(), "API error (500)") {
if isDockerTransientError(startErr) {
if attempted < 5 {
attempted++
time.Sleep(1 * time.Second)
@ -1458,3 +1467,18 @@ func sliceMergeUlimit(ulimitsRaw map[string]string) ([]docker.ULimit, error) {
func (d *Driver) Shutdown() {
d.signalShutdown()
}
func isDockerTransientError(err error) bool {
if err == nil {
return false
}
errMsg := err.Error()
for _, te := range dockerTransientErrs {
if strings.Contains(errMsg, te) {
return true
}
}
return false
}