CSI: make plugin health_timeout configurable in csi_plugin stanza (#13340)

Signed-off-by: Grant Griffiths <ggriffiths@purestorage.com>
This commit is contained in:
Grant Griffiths 2022-06-14 07:04:16 -07:00 committed by GitHub
parent f41ea0e5dc
commit 99896da443
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 53 additions and 16 deletions

3
.changelog/13340.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:improvements
csi: Made the CSI plugin supervisor health check timeout configurable with a new `health_timeout` field in the `csi_plugin` stanza
```

View File

@ -1039,10 +1039,18 @@ type TaskCSIPluginConfig struct {
//
// Default is /csi.
MountDir string `mapstructure:"mount_dir" hcl:"mount_dir,optional"`
// HealthTimeout is how long the plugin supervisor waits for the CSI plugin
// to become healthy before the plugin task is killed. Default is 30s.
HealthTimeout time.Duration `mapstructure:"health_timeout" hcl:"health_timeout,optional"`
}
// Canonicalize fills in defaults for fields that were left unset: MountDir
// becomes "/csi" when empty, and HealthTimeout becomes 30s when it is not a
// positive duration.
func (t *TaskCSIPluginConfig) Canonicalize() {
	if t.MountDir == "" {
		t.MountDir = "/csi"
	}

	// Zero means "not set". A negative timeout can never be satisfied (a
	// context deadline built from it is already expired), so it falls back
	// to the default as well instead of being passed through.
	if t.HealthTimeout <= 0 {
		t.HealthTimeout = 30 * time.Second
	}
}

View File

@ -103,6 +103,10 @@ func newCSIPluginSupervisorHook(config *csiPluginSupervisorHookConfig) *csiPlugi
socketMountPoint := filepath.Join(config.clientStateDirPath, "csi",
"plugins", config.runner.Alloc().ID)
if task.CSIPluginConfig.HealthTimeout == 0 {
task.CSIPluginConfig.HealthTimeout = 30 * time.Second
}
shutdownCtx, cancelFn := context.WithCancel(context.Background())
hook := &csiPluginSupervisorHook{
@ -253,7 +257,7 @@ func (h *csiPluginSupervisorHook) ensureSupervisorLoop(ctx context.Context) {
// We're in Poststart at this point, so if we can't connect within
// this deadline, assume it's broken so we can restart the task
startCtx, startCancelFn := context.WithTimeout(ctx, 30*time.Second)
startCtx, startCancelFn := context.WithTimeout(ctx, h.task.CSIPluginConfig.HealthTimeout)
defer startCancelFn()
var err error
@ -441,7 +445,7 @@ func (h *csiPluginSupervisorHook) kill(ctx context.Context, reason error) {
if err := h.lifecycle.Kill(ctx,
structs.NewTaskEvent(structs.TaskKilling).
SetFailsTask().
SetDisplayMessage("CSI plugin did not become healthy before timeout"),
SetDisplayMessage(fmt.Sprintf("CSI plugin did not become healthy before configured %v health timeout", h.task.CSIPluginConfig.HealthTimeout.String())),
); err != nil {
h.logger.Error("failed to kill task", "kill_reason", reason, "error", err)
}

View File

@ -1263,6 +1263,7 @@ func ApiCSIPluginConfigToStructsCSIPluginConfig(apiConfig *api.TaskCSIPluginConf
sc.ID = apiConfig.ID
sc.Type = structs.CSIPluginType(apiConfig.Type)
sc.MountDir = apiConfig.MountDir
sc.HealthTimeout = apiConfig.HealthTimeout
return sc
}

View File

@ -158,12 +158,20 @@ func parseTask(item *ast.ObjectItem, keys []string) (*api.Task, error) {
i := o.Elem().Items[0]
var m map[string]interface{}
var cfg api.TaskCSIPluginConfig
if err := hcl.DecodeObject(&m, i.Val); err != nil {
return nil, err
}
var cfg api.TaskCSIPluginConfig
if err := mapstructure.WeakDecode(m, &cfg); err != nil {
dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{
DecodeHook: mapstructure.StringToTimeDurationHookFunc(),
WeaklyTypedInput: true,
Result: &cfg,
})
if err != nil {
return nil, err
}
if err := dec.Decode(m); err != nil {
return nil, err
}

View File

@ -626,9 +626,10 @@ func TestParse(t *testing.T) {
Name: "binstore",
Driver: "docker",
CSIPluginConfig: &api.TaskCSIPluginConfig{
ID: "org.hashicorp.csi",
Type: api.CSIPluginTypeMonolith,
MountDir: "/csi/test",
ID: "org.hashicorp.csi",
Type: api.CSIPluginTypeMonolith,
MountDir: "/csi/test",
HealthTimeout: 1 * time.Minute,
},
},
},

View File

@ -4,9 +4,10 @@ job "binstore-storagelocker" {
driver = "docker"
csi_plugin {
id = "org.hashicorp.csi"
type = "monolith"
mount_dir = "/csi/test"
id = "org.hashicorp.csi"
type = "monolith"
mount_dir = "/csi/test"
health_timeout = "1m"
}
}
}

View File

@ -67,6 +67,10 @@ type TaskCSIPluginConfig struct {
// to be created by the plugin, and will provide references into
// "MountDir/CSIIntermediaryDirname/{VolumeName}/{AllocID} for mounts.
MountDir string
// HealthTimeout is how long the plugin supervisor waits for the CSI plugin
// to become healthy before the plugin task is killed.
HealthTimeout time.Duration `mapstructure:"health_timeout" hcl:"health_timeout,optional"`
}
func (t *TaskCSIPluginConfig) Copy() *TaskCSIPluginConfig {

View File

@ -17,9 +17,10 @@ to claim [volumes][csi_volumes].
```hcl
csi_plugin {
id = "csi-hostpath"
type = "monolith"
mount_dir = "/csi"
id = "csi-hostpath"
type = "monolith"
mount_dir = "/csi"
health_timeout = "30s"
}
```
@ -43,6 +44,11 @@ csi_plugin {
container where the plugin will expect a Unix domain socket for
bidirectional communication with Nomad.
- `health_timeout` `(duration: <optional>)` - The duration that
the plugin supervisor will wait before restarting an unhealthy
CSI plugin. Must be a duration value such as `30s` or `2m`.
Defaults to `30s` if not set.
~> **Note:** Plugins running as `node` or `monolith` require root
privileges (or `CAP_SYS_ADMIN` on Linux) to mount volumes on the
host. With the Docker task driver, you can use the `privileged = true`
@ -111,10 +117,11 @@ job "plugin-efs" {
}
csi_plugin {
id = "aws-efs0"
type = "node"
mount_dir = "/csi" # this path /csi matches the --endpoint
id = "aws-efs0"
type = "node"
mount_dir = "/csi" # this path /csi matches the --endpoint
# argument for the container
health_timeout = "30s"
}
}
}