CSI: make plugin health_timeout configurable in csi_plugin stanza (#13340)
Signed-off-by: Grant Griffiths <ggriffiths@purestorage.com>
This commit is contained in:
parent
f41ea0e5dc
commit
99896da443
|
@ -0,0 +1,3 @@
|
|||
```release-note:improvements
|
||||
csi: Made the CSI plugin supervisor health check timeout configurable with a new `health_timeout` field in the `csi_plugin` stanza
|
||||
```
|
|
@ -1039,10 +1039,18 @@ type TaskCSIPluginConfig struct {
|
|||
//
|
||||
// Default is /csi.
|
||||
MountDir string `mapstructure:"mount_dir" hcl:"mount_dir,optional"`
|
||||
|
||||
// HealthTimeout is the time after which the CSI plugin tasks will be killed
|
||||
// if the CSI Plugin is not healthy.
|
||||
HealthTimeout time.Duration `mapstructure:"health_timeout" hcl:"health_timeout,optional"`
|
||||
}
|
||||
|
||||
func (t *TaskCSIPluginConfig) Canonicalize() {
|
||||
if t.MountDir == "" {
|
||||
t.MountDir = "/csi"
|
||||
}
|
||||
|
||||
if t.HealthTimeout == 0 {
|
||||
t.HealthTimeout = 30 * time.Second
|
||||
}
|
||||
}
|
||||
|
|
|
@ -103,6 +103,10 @@ func newCSIPluginSupervisorHook(config *csiPluginSupervisorHookConfig) *csiPlugi
|
|||
socketMountPoint := filepath.Join(config.clientStateDirPath, "csi",
|
||||
"plugins", config.runner.Alloc().ID)
|
||||
|
||||
if task.CSIPluginConfig.HealthTimeout == 0 {
|
||||
task.CSIPluginConfig.HealthTimeout = 30 * time.Second
|
||||
}
|
||||
|
||||
shutdownCtx, cancelFn := context.WithCancel(context.Background())
|
||||
|
||||
hook := &csiPluginSupervisorHook{
|
||||
|
@ -253,7 +257,7 @@ func (h *csiPluginSupervisorHook) ensureSupervisorLoop(ctx context.Context) {
|
|||
|
||||
// We're in Poststart at this point, so if we can't connect within
|
||||
// this deadline, assume it's broken so we can restart the task
|
||||
startCtx, startCancelFn := context.WithTimeout(ctx, 30*time.Second)
|
||||
startCtx, startCancelFn := context.WithTimeout(ctx, h.task.CSIPluginConfig.HealthTimeout)
|
||||
defer startCancelFn()
|
||||
|
||||
var err error
|
||||
|
@ -441,7 +445,7 @@ func (h *csiPluginSupervisorHook) kill(ctx context.Context, reason error) {
|
|||
if err := h.lifecycle.Kill(ctx,
|
||||
structs.NewTaskEvent(structs.TaskKilling).
|
||||
SetFailsTask().
|
||||
SetDisplayMessage("CSI plugin did not become healthy before timeout"),
|
||||
SetDisplayMessage(fmt.Sprintf("CSI plugin did not become healthy before configured %v health timeout", h.task.CSIPluginConfig.HealthTimeout.String())),
|
||||
); err != nil {
|
||||
h.logger.Error("failed to kill task", "kill_reason", reason, "error", err)
|
||||
}
|
||||
|
|
|
@ -1263,6 +1263,7 @@ func ApiCSIPluginConfigToStructsCSIPluginConfig(apiConfig *api.TaskCSIPluginConf
|
|||
sc.ID = apiConfig.ID
|
||||
sc.Type = structs.CSIPluginType(apiConfig.Type)
|
||||
sc.MountDir = apiConfig.MountDir
|
||||
sc.HealthTimeout = apiConfig.HealthTimeout
|
||||
return sc
|
||||
}
|
||||
|
||||
|
|
|
@ -158,12 +158,20 @@ func parseTask(item *ast.ObjectItem, keys []string) (*api.Task, error) {
|
|||
i := o.Elem().Items[0]
|
||||
|
||||
var m map[string]interface{}
|
||||
var cfg api.TaskCSIPluginConfig
|
||||
if err := hcl.DecodeObject(&m, i.Val); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var cfg api.TaskCSIPluginConfig
|
||||
if err := mapstructure.WeakDecode(m, &cfg); err != nil {
|
||||
dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{
|
||||
DecodeHook: mapstructure.StringToTimeDurationHookFunc(),
|
||||
WeaklyTypedInput: true,
|
||||
Result: &cfg,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := dec.Decode(m); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
|
|
|
@ -626,9 +626,10 @@ func TestParse(t *testing.T) {
|
|||
Name: "binstore",
|
||||
Driver: "docker",
|
||||
CSIPluginConfig: &api.TaskCSIPluginConfig{
|
||||
ID: "org.hashicorp.csi",
|
||||
Type: api.CSIPluginTypeMonolith,
|
||||
MountDir: "/csi/test",
|
||||
ID: "org.hashicorp.csi",
|
||||
Type: api.CSIPluginTypeMonolith,
|
||||
MountDir: "/csi/test",
|
||||
HealthTimeout: 1 * time.Minute,
|
||||
},
|
||||
},
|
||||
},
|
||||
|
|
|
@ -4,9 +4,10 @@ job "binstore-storagelocker" {
|
|||
driver = "docker"
|
||||
|
||||
csi_plugin {
|
||||
id = "org.hashicorp.csi"
|
||||
type = "monolith"
|
||||
mount_dir = "/csi/test"
|
||||
id = "org.hashicorp.csi"
|
||||
type = "monolith"
|
||||
mount_dir = "/csi/test"
|
||||
health_timeout = "1m"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -67,6 +67,10 @@ type TaskCSIPluginConfig struct {
|
|||
// to be created by the plugin, and will provide references into
|
||||
// "MountDir/CSIIntermediaryDirname/{VolumeName}/{AllocID} for mounts.
|
||||
MountDir string
|
||||
|
||||
// HealthTimeout is the time after which the CSI plugin tasks will be killed
|
||||
// if the CSI Plugin is not healthy.
|
||||
HealthTimeout time.Duration `mapstructure:"health_timeout" hcl:"health_timeout,optional"`
|
||||
}
|
||||
|
||||
func (t *TaskCSIPluginConfig) Copy() *TaskCSIPluginConfig {
|
||||
|
|
|
@ -17,9 +17,10 @@ to claim [volumes][csi_volumes].
|
|||
|
||||
```hcl
|
||||
csi_plugin {
|
||||
id = "csi-hostpath"
|
||||
type = "monolith"
|
||||
mount_dir = "/csi"
|
||||
id = "csi-hostpath"
|
||||
type = "monolith"
|
||||
mount_dir = "/csi"
|
||||
health_timeout = "30s"
|
||||
}
|
||||
```
|
||||
|
||||
|
@ -43,6 +44,11 @@ csi_plugin {
|
|||
container where the plugin will expect a Unix domain socket for
|
||||
bidirectional communication with Nomad.
|
||||
|
||||
- `health_timeout` `(duration: <optional>)` - The duration that
|
||||
the plugin supervisor will wait before restarting an unhealthy
|
||||
CSI plugin. Must be a duration value such as `30s` or `2m`.
|
||||
Defaults to `30s` if not set.
|
||||
|
||||
~> **Note:** Plugins running as `node` or `monolith` require root
|
||||
privileges (or `CAP_SYS_ADMIN` on Linux) to mount volumes on the
|
||||
host. With the Docker task driver, you can use the `privileged = true`
|
||||
|
@ -111,10 +117,11 @@ job "plugin-efs" {
|
|||
}
|
||||
|
||||
csi_plugin {
|
||||
id = "aws-efs0"
|
||||
type = "node"
|
||||
mount_dir = "/csi" # this path /csi matches the --endpoint
|
||||
id = "aws-efs0"
|
||||
type = "node"
|
||||
mount_dir = "/csi" # this path /csi matches the --endpoint
|
||||
# argument for the container
|
||||
health_timeout = "30s"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue