CSI: make plugin health_timeout configurable in csi_plugin stanza (#13340)
Signed-off-by: Grant Griffiths <ggriffiths@purestorage.com>
This commit is contained in:
parent
f41ea0e5dc
commit
99896da443
3
.changelog/13340.txt
Normal file
3
.changelog/13340.txt
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
```release-note:improvements
|
||||||
|
csi: Made the CSI plugin supervisor health check timeout configurable with a new `health_timeout` field in the `csi_plugin` stanza
|
||||||
|
```
|
|
@ -1039,10 +1039,18 @@ type TaskCSIPluginConfig struct {
|
||||||
//
|
//
|
||||||
// Default is /csi.
|
// Default is /csi.
|
||||||
MountDir string `mapstructure:"mount_dir" hcl:"mount_dir,optional"`
|
MountDir string `mapstructure:"mount_dir" hcl:"mount_dir,optional"`
|
||||||
|
|
||||||
|
// HealthTimeout is the time after which the CSI plugin tasks will be killed
|
||||||
|
// if the CSI Plugin is not healthy.
|
||||||
|
HealthTimeout time.Duration `mapstructure:"health_timeout" hcl:"health_timeout,optional"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t *TaskCSIPluginConfig) Canonicalize() {
|
func (t *TaskCSIPluginConfig) Canonicalize() {
|
||||||
if t.MountDir == "" {
|
if t.MountDir == "" {
|
||||||
t.MountDir = "/csi"
|
t.MountDir = "/csi"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if t.HealthTimeout == 0 {
|
||||||
|
t.HealthTimeout = 30 * time.Second
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -103,6 +103,10 @@ func newCSIPluginSupervisorHook(config *csiPluginSupervisorHookConfig) *csiPlugi
|
||||||
socketMountPoint := filepath.Join(config.clientStateDirPath, "csi",
|
socketMountPoint := filepath.Join(config.clientStateDirPath, "csi",
|
||||||
"plugins", config.runner.Alloc().ID)
|
"plugins", config.runner.Alloc().ID)
|
||||||
|
|
||||||
|
if task.CSIPluginConfig.HealthTimeout == 0 {
|
||||||
|
task.CSIPluginConfig.HealthTimeout = 30 * time.Second
|
||||||
|
}
|
||||||
|
|
||||||
shutdownCtx, cancelFn := context.WithCancel(context.Background())
|
shutdownCtx, cancelFn := context.WithCancel(context.Background())
|
||||||
|
|
||||||
hook := &csiPluginSupervisorHook{
|
hook := &csiPluginSupervisorHook{
|
||||||
|
@ -253,7 +257,7 @@ func (h *csiPluginSupervisorHook) ensureSupervisorLoop(ctx context.Context) {
|
||||||
|
|
||||||
// We're in Poststart at this point, so if we can't connect within
|
// We're in Poststart at this point, so if we can't connect within
|
||||||
// this deadline, assume it's broken so we can restart the task
|
// this deadline, assume it's broken so we can restart the task
|
||||||
startCtx, startCancelFn := context.WithTimeout(ctx, 30*time.Second)
|
startCtx, startCancelFn := context.WithTimeout(ctx, h.task.CSIPluginConfig.HealthTimeout)
|
||||||
defer startCancelFn()
|
defer startCancelFn()
|
||||||
|
|
||||||
var err error
|
var err error
|
||||||
|
@ -441,7 +445,7 @@ func (h *csiPluginSupervisorHook) kill(ctx context.Context, reason error) {
|
||||||
if err := h.lifecycle.Kill(ctx,
|
if err := h.lifecycle.Kill(ctx,
|
||||||
structs.NewTaskEvent(structs.TaskKilling).
|
structs.NewTaskEvent(structs.TaskKilling).
|
||||||
SetFailsTask().
|
SetFailsTask().
|
||||||
SetDisplayMessage("CSI plugin did not become healthy before timeout"),
|
SetDisplayMessage(fmt.Sprintf("CSI plugin did not become healthy before configured %v health timeout", h.task.CSIPluginConfig.HealthTimeout.String())),
|
||||||
); err != nil {
|
); err != nil {
|
||||||
h.logger.Error("failed to kill task", "kill_reason", reason, "error", err)
|
h.logger.Error("failed to kill task", "kill_reason", reason, "error", err)
|
||||||
}
|
}
|
||||||
|
|
|
@ -1263,6 +1263,7 @@ func ApiCSIPluginConfigToStructsCSIPluginConfig(apiConfig *api.TaskCSIPluginConf
|
||||||
sc.ID = apiConfig.ID
|
sc.ID = apiConfig.ID
|
||||||
sc.Type = structs.CSIPluginType(apiConfig.Type)
|
sc.Type = structs.CSIPluginType(apiConfig.Type)
|
||||||
sc.MountDir = apiConfig.MountDir
|
sc.MountDir = apiConfig.MountDir
|
||||||
|
sc.HealthTimeout = apiConfig.HealthTimeout
|
||||||
return sc
|
return sc
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -158,12 +158,20 @@ func parseTask(item *ast.ObjectItem, keys []string) (*api.Task, error) {
|
||||||
i := o.Elem().Items[0]
|
i := o.Elem().Items[0]
|
||||||
|
|
||||||
var m map[string]interface{}
|
var m map[string]interface{}
|
||||||
|
var cfg api.TaskCSIPluginConfig
|
||||||
if err := hcl.DecodeObject(&m, i.Val); err != nil {
|
if err := hcl.DecodeObject(&m, i.Val); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
var cfg api.TaskCSIPluginConfig
|
dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{
|
||||||
if err := mapstructure.WeakDecode(m, &cfg); err != nil {
|
DecodeHook: mapstructure.StringToTimeDurationHookFunc(),
|
||||||
|
WeaklyTypedInput: true,
|
||||||
|
Result: &cfg,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if err := dec.Decode(m); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -629,6 +629,7 @@ func TestParse(t *testing.T) {
|
||||||
ID: "org.hashicorp.csi",
|
ID: "org.hashicorp.csi",
|
||||||
Type: api.CSIPluginTypeMonolith,
|
Type: api.CSIPluginTypeMonolith,
|
||||||
MountDir: "/csi/test",
|
MountDir: "/csi/test",
|
||||||
|
HealthTimeout: 1 * time.Minute,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
|
|
@ -7,6 +7,7 @@ job "binstore-storagelocker" {
|
||||||
id = "org.hashicorp.csi"
|
id = "org.hashicorp.csi"
|
||||||
type = "monolith"
|
type = "monolith"
|
||||||
mount_dir = "/csi/test"
|
mount_dir = "/csi/test"
|
||||||
|
health_timeout = "1m"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -67,6 +67,10 @@ type TaskCSIPluginConfig struct {
|
||||||
// to be created by the plugin, and will provide references into
|
// to be created by the plugin, and will provide references into
|
||||||
// "MountDir/CSIIntermediaryDirname/{VolumeName}/{AllocID} for mounts.
|
// "MountDir/CSIIntermediaryDirname/{VolumeName}/{AllocID} for mounts.
|
||||||
MountDir string
|
MountDir string
|
||||||
|
|
||||||
|
// HealthTimeout is the time after which the CSI plugin tasks will be killed
|
||||||
|
// if the CSI Plugin is not healthy.
|
||||||
|
HealthTimeout time.Duration `mapstructure:"health_timeout" hcl:"health_timeout,optional"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t *TaskCSIPluginConfig) Copy() *TaskCSIPluginConfig {
|
func (t *TaskCSIPluginConfig) Copy() *TaskCSIPluginConfig {
|
||||||
|
|
|
@ -20,6 +20,7 @@ csi_plugin {
|
||||||
id = "csi-hostpath"
|
id = "csi-hostpath"
|
||||||
type = "monolith"
|
type = "monolith"
|
||||||
mount_dir = "/csi"
|
mount_dir = "/csi"
|
||||||
|
health_timeout = "30s"
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -43,6 +44,11 @@ csi_plugin {
|
||||||
container where the plugin will expect a Unix domain socket for
|
container where the plugin will expect a Unix domain socket for
|
||||||
bidirectional communication with Nomad.
|
bidirectional communication with Nomad.
|
||||||
|
|
||||||
|
- `health_timeout` `(duration: <optional>)` - The duration that
|
||||||
|
the plugin supervisor will wait for the CSI plugin to become healthy
|
||||||
|
before killing its task so that it can be restarted. Must be a duration value such as `30s` or `2m`.
|
||||||
|
Defaults to `30s` if not set.
|
||||||
|
|
||||||
~> **Note:** Plugins running as `node` or `monolith` require root
|
~> **Note:** Plugins running as `node` or `monolith` require root
|
||||||
privileges (or `CAP_SYS_ADMIN` on Linux) to mount volumes on the
|
privileges (or `CAP_SYS_ADMIN` on Linux) to mount volumes on the
|
||||||
host. With the Docker task driver, you can use the `privileged = true`
|
host. With the Docker task driver, you can use the `privileged = true`
|
||||||
|
@ -115,6 +121,7 @@ job "plugin-efs" {
|
||||||
type = "node"
|
type = "node"
|
||||||
mount_dir = "/csi" # this path /csi matches the --endpoint
|
mount_dir = "/csi" # this path /csi matches the --endpoint
|
||||||
# argument for the container
|
# argument for the container
|
||||||
|
health_timeout = "30s"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue