client: recover from getter panics (#14696)

The artifact getter uses the go-getter library to fetch files from
different sources. Any bug in this library that results in a panic can
cause the entire Nomad client to crash due to a single file download
attempt.

This change aims to guard against this types of crashes by recovering
from panics when the getter attempts to download an artifact. The
resulting panic is converted to an error that is stored as a task event
for operator visibility and the panic stack trace is logged to the
client's log.
This commit is contained in:
Luiz Aoqui 2022-09-26 15:16:26 -04:00 committed by GitHub
parent b554f9344a
commit 5c100c0d3d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 49 additions and 5 deletions

3
.changelog/14696.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:security
client: recover from panics caused by artifact download to prevent the Nomad client from crashing
```

View File

@ -5,10 +5,12 @@ import (
"fmt"
"net/http"
"net/url"
"runtime/debug"
"strings"
"github.com/hashicorp/go-cleanhttp"
gg "github.com/hashicorp/go-getter"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/client/config"
"github.com/hashicorp/nomad/client/interfaces"
@ -22,6 +24,8 @@ const (
// Getter wraps go-getter calls in an artifact configuration.
type Getter struct {
logger hclog.Logger
// httpClient is a shared HTTP client for use across all http/https
// Getter instantiations. The HTTP client is designed to be
// thread-safe, and using a pooled transport will help reduce excessive
@ -32,8 +36,9 @@ type Getter struct {
// NewGetter returns a new Getter instance. This function is called once per
// client and shared across alloc and task runners.
func NewGetter(config *config.ArtifactConfig) *Getter {
func NewGetter(logger hclog.Logger, config *config.ArtifactConfig) *Getter {
return &Getter{
logger: logger,
httpClient: &http.Client{
Transport: cleanhttp.DefaultPooledTransport(),
},
@ -42,7 +47,19 @@ func NewGetter(config *config.ArtifactConfig) *Getter {
}
// GetArtifact downloads an artifact into the specified task directory.
func (g *Getter) GetArtifact(taskEnv interfaces.EnvReplacer, artifact *structs.TaskArtifact) error {
func (g *Getter) GetArtifact(taskEnv interfaces.EnvReplacer, artifact *structs.TaskArtifact) (returnErr error) {
// Recover from panics to avoid crashing the entire Nomad client due to
// artifact download failures, such as bugs in go-getter.
defer func() {
if r := recover(); r != nil {
g.logger.Error("panic while downloading artifact",
"artifact", artifact.GetterSource,
"error", r,
"stack", string(debug.Stack()))
returnErr = fmt.Errorf("getter panic: %v", r)
}
}()
ggURL, err := getGetterUrl(taskEnv, artifact)
if err != nil {
return newGetError(artifact.GetterSource, err, false)

View File

@ -16,6 +16,7 @@ import (
"time"
gg "github.com/hashicorp/go-getter"
"github.com/hashicorp/go-hclog"
clientconfig "github.com/hashicorp/nomad/client/config"
"github.com/hashicorp/nomad/client/interfaces"
"github.com/hashicorp/nomad/client/taskenv"
@ -56,6 +57,19 @@ func noopTaskEnv(taskDir string) interfaces.EnvReplacer {
}
}
// panicReplacer is a version of taskenv.TaskEnv.ReplaceEnv that panics.
type panicReplacer struct{}
func (panicReplacer) ReplaceEnv(_ string) string {
panic("panic")
}
func (panicReplacer) ClientPath(_ string, _ bool) (string, bool) {
panic("panic")
}
func panicTaskEnv() interfaces.EnvReplacer {
return panicReplacer{}
}
// upperReplacer is a version of taskenv.TaskEnv.ReplaceEnv that upper-cases
// the given input.
type upperReplacer struct {
@ -72,7 +86,7 @@ func (u upperReplacer) ClientPath(p string, join bool) (string, bool) {
}
func TestGetter_getClient(t *testing.T) {
getter := NewGetter(&clientconfig.ArtifactConfig{
getter := NewGetter(hclog.NewNullLogger(), &clientconfig.ArtifactConfig{
HTTPReadTimeout: time.Minute,
HTTPMaxBytes: 100_000,
GCSTimeout: 1 * time.Minute,
@ -436,6 +450,15 @@ func TestGetArtifact_Setuid(t *testing.T) {
}
}
// TestGetArtifact_handlePanic tests that a panic during the getter execution
// does not cause its goroutine to crash.
func TestGetArtifact_handlePanic(t *testing.T) {
getter := TestDefaultGetter(t)
err := getter.GetArtifact(panicTaskEnv(), &structs.TaskArtifact{})
require.Error(t, err)
require.Contains(t, err.Error(), "panic")
}
func TestGetGetterUrl_Queries(t *testing.T) {
cases := []struct {
name string

View File

@ -6,6 +6,7 @@ package getter
import (
"testing"
"github.com/hashicorp/go-hclog"
clientconfig "github.com/hashicorp/nomad/client/config"
"github.com/hashicorp/nomad/nomad/structs/config"
"github.com/stretchr/testify/require"
@ -14,5 +15,5 @@ import (
func TestDefaultGetter(t *testing.T) *Getter {
getterConf, err := clientconfig.ArtifactConfigFromAgent(config.DefaultArtifactConfig())
require.NoError(t, err)
return NewGetter(getterConf)
return NewGetter(hclog.NewNullLogger(), getterConf)
}

View File

@ -396,7 +396,7 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulProxie
serversContactedCh: make(chan struct{}),
serversContactedOnce: sync.Once{},
cpusetManager: cgutil.CreateCPUSetManager(cfg.CgroupParent, cfg.ReservableCores, logger),
getter: getter.NewGetter(cfg.Artifact),
getter: getter.NewGetter(logger.Named("artifact_getter"), cfg.Artifact),
EnterpriseClient: newEnterpriseClient(logger),
}