CSI: make plugin health_timeout configurable in csi_plugin stanza (#13340)

Signed-off-by: Grant Griffiths <ggriffiths@purestorage.com>
This commit is contained in:
Grant Griffiths 2022-06-14 07:04:16 -07:00 committed by GitHub
parent f41ea0e5dc
commit 99896da443
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 53 additions and 16 deletions

3
.changelog/13340.txt Normal file
View file

@ -0,0 +1,3 @@
```release-note:improvement
csi: Made the CSI plugin supervisor health check timeout configurable with a new `health_timeout` field in the `csi_plugin` stanza
```

View file

@ -1039,10 +1039,18 @@ type TaskCSIPluginConfig struct {
// //
// Default is /csi. // Default is /csi.
MountDir string `mapstructure:"mount_dir" hcl:"mount_dir,optional"` MountDir string `mapstructure:"mount_dir" hcl:"mount_dir,optional"`
// HealthTimeout is the time after which the CSI plugin tasks will be killed
// if the CSI Plugin is not healthy.
HealthTimeout time.Duration `mapstructure:"health_timeout" hcl:"health_timeout,optional"`
} }
func (t *TaskCSIPluginConfig) Canonicalize() { func (t *TaskCSIPluginConfig) Canonicalize() {
if t.MountDir == "" { if t.MountDir == "" {
t.MountDir = "/csi" t.MountDir = "/csi"
} }
if t.HealthTimeout == 0 {
t.HealthTimeout = 30 * time.Second
}
} }

View file

@ -103,6 +103,10 @@ func newCSIPluginSupervisorHook(config *csiPluginSupervisorHookConfig) *csiPlugi
socketMountPoint := filepath.Join(config.clientStateDirPath, "csi", socketMountPoint := filepath.Join(config.clientStateDirPath, "csi",
"plugins", config.runner.Alloc().ID) "plugins", config.runner.Alloc().ID)
if task.CSIPluginConfig.HealthTimeout == 0 {
task.CSIPluginConfig.HealthTimeout = 30 * time.Second
}
shutdownCtx, cancelFn := context.WithCancel(context.Background()) shutdownCtx, cancelFn := context.WithCancel(context.Background())
hook := &csiPluginSupervisorHook{ hook := &csiPluginSupervisorHook{
@ -253,7 +257,7 @@ func (h *csiPluginSupervisorHook) ensureSupervisorLoop(ctx context.Context) {
// We're in Poststart at this point, so if we can't connect within // We're in Poststart at this point, so if we can't connect within
// this deadline, assume it's broken so we can restart the task // this deadline, assume it's broken so we can restart the task
startCtx, startCancelFn := context.WithTimeout(ctx, 30*time.Second) startCtx, startCancelFn := context.WithTimeout(ctx, h.task.CSIPluginConfig.HealthTimeout)
defer startCancelFn() defer startCancelFn()
var err error var err error
@ -441,7 +445,7 @@ func (h *csiPluginSupervisorHook) kill(ctx context.Context, reason error) {
if err := h.lifecycle.Kill(ctx, if err := h.lifecycle.Kill(ctx,
structs.NewTaskEvent(structs.TaskKilling). structs.NewTaskEvent(structs.TaskKilling).
SetFailsTask(). SetFailsTask().
SetDisplayMessage("CSI plugin did not become healthy before timeout"), SetDisplayMessage(fmt.Sprintf("CSI plugin did not become healthy before configured %v health timeout", h.task.CSIPluginConfig.HealthTimeout.String())),
); err != nil { ); err != nil {
h.logger.Error("failed to kill task", "kill_reason", reason, "error", err) h.logger.Error("failed to kill task", "kill_reason", reason, "error", err)
} }

View file

@ -1263,6 +1263,7 @@ func ApiCSIPluginConfigToStructsCSIPluginConfig(apiConfig *api.TaskCSIPluginConf
sc.ID = apiConfig.ID sc.ID = apiConfig.ID
sc.Type = structs.CSIPluginType(apiConfig.Type) sc.Type = structs.CSIPluginType(apiConfig.Type)
sc.MountDir = apiConfig.MountDir sc.MountDir = apiConfig.MountDir
sc.HealthTimeout = apiConfig.HealthTimeout
return sc return sc
} }

View file

@ -158,12 +158,20 @@ func parseTask(item *ast.ObjectItem, keys []string) (*api.Task, error) {
i := o.Elem().Items[0] i := o.Elem().Items[0]
var m map[string]interface{} var m map[string]interface{}
var cfg api.TaskCSIPluginConfig
if err := hcl.DecodeObject(&m, i.Val); err != nil { if err := hcl.DecodeObject(&m, i.Val); err != nil {
return nil, err return nil, err
} }
var cfg api.TaskCSIPluginConfig dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{
if err := mapstructure.WeakDecode(m, &cfg); err != nil { DecodeHook: mapstructure.StringToTimeDurationHookFunc(),
WeaklyTypedInput: true,
Result: &cfg,
})
if err != nil {
return nil, err
}
if err := dec.Decode(m); err != nil {
return nil, err return nil, err
} }

View file

@ -629,6 +629,7 @@ func TestParse(t *testing.T) {
ID: "org.hashicorp.csi", ID: "org.hashicorp.csi",
Type: api.CSIPluginTypeMonolith, Type: api.CSIPluginTypeMonolith,
MountDir: "/csi/test", MountDir: "/csi/test",
HealthTimeout: 1 * time.Minute,
}, },
}, },
}, },

View file

@ -7,6 +7,7 @@ job "binstore-storagelocker" {
id = "org.hashicorp.csi" id = "org.hashicorp.csi"
type = "monolith" type = "monolith"
mount_dir = "/csi/test" mount_dir = "/csi/test"
health_timeout = "1m"
} }
} }
} }

View file

@ -67,6 +67,10 @@ type TaskCSIPluginConfig struct {
// to be created by the plugin, and will provide references into // to be created by the plugin, and will provide references into
// "MountDir/CSIIntermediaryDirname/{VolumeName}/{AllocID} for mounts. // "MountDir/CSIIntermediaryDirname/{VolumeName}/{AllocID} for mounts.
MountDir string MountDir string
// HealthTimeout is the time after which the CSI plugin tasks will be killed
// if the CSI Plugin is not healthy.
HealthTimeout time.Duration `mapstructure:"health_timeout" hcl:"health_timeout,optional"`
} }
func (t *TaskCSIPluginConfig) Copy() *TaskCSIPluginConfig { func (t *TaskCSIPluginConfig) Copy() *TaskCSIPluginConfig {

View file

@ -20,6 +20,7 @@ csi_plugin {
id = "csi-hostpath" id = "csi-hostpath"
type = "monolith" type = "monolith"
mount_dir = "/csi" mount_dir = "/csi"
health_timeout = "30s"
} }
``` ```
@ -43,6 +44,11 @@ csi_plugin {
container where the plugin will expect a Unix domain socket for container where the plugin will expect a Unix domain socket for
bidirectional communication with Nomad. bidirectional communication with Nomad.
- `health_timeout` `(duration: "30s")` - The maximum time the plugin
supervisor waits for the CSI plugin to become healthy before killing
the plugin task so it can be restarted. Must be a duration value such
as `30s` or `2m`. Defaults to `30s` if not set.
~> **Note:** Plugins running as `node` or `monolith` require root ~> **Note:** Plugins running as `node` or `monolith` require root
privileges (or `CAP_SYS_ADMIN` on Linux) to mount volumes on the privileges (or `CAP_SYS_ADMIN` on Linux) to mount volumes on the
host. With the Docker task driver, you can use the `privileged = true` host. With the Docker task driver, you can use the `privileged = true`
@ -115,6 +121,7 @@ job "plugin-efs" {
type = "node" type = "node"
mount_dir = "/csi" # this path /csi matches the --endpoint mount_dir = "/csi" # this path /csi matches the --endpoint
# argument for the container # argument for the container
health_timeout = "30s"
} }
} }
} }