557a6b4a5e
This PR fixes a bug where the docker network pause container would not be stopped and removed in the case where a node is restarted, the alloc is moved to another node, the node comes back up. See the issue below for full repro conditions. Basically in the DestroyNetwork PostRun hook we would depend on the NetworkIsolationSpec field not being nil - which is only the case if the Client stays alive all the way from network creation to network teardown. If the node is rebooted we lose that state and previously would not be able to find the pause container to remove. Now, we manually find the pause container by scanning them and looking for the associated allocID. Fixes #17299
509 lines
14 KiB
Go
509 lines
14 KiB
Go
// Copyright (c) HashiCorp, Inc.
|
|
// SPDX-License-Identifier: MPL-2.0
|
|
|
|
package drivers
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"io"
|
|
"time"
|
|
|
|
"github.com/LK4D4/joincontext"
|
|
"github.com/golang/protobuf/ptypes"
|
|
hclog "github.com/hashicorp/go-hclog"
|
|
cstructs "github.com/hashicorp/nomad/client/structs"
|
|
"github.com/hashicorp/nomad/helper/pluginutils/grpcutils"
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
|
"github.com/hashicorp/nomad/plugins/base"
|
|
"github.com/hashicorp/nomad/plugins/drivers/proto"
|
|
"github.com/hashicorp/nomad/plugins/shared/hclspec"
|
|
pstructs "github.com/hashicorp/nomad/plugins/shared/structs"
|
|
sproto "github.com/hashicorp/nomad/plugins/shared/structs/proto"
|
|
"google.golang.org/grpc/status"
|
|
)
|
|
|
|
var _ DriverPlugin = &driverPluginClient{}
|
|
|
|
type driverPluginClient struct {
|
|
*base.BasePluginClient
|
|
|
|
client proto.DriverClient
|
|
logger hclog.Logger
|
|
|
|
// doneCtx is closed when the plugin exits
|
|
doneCtx context.Context
|
|
}
|
|
|
|
func (d *driverPluginClient) TaskConfigSchema() (*hclspec.Spec, error) {
|
|
req := &proto.TaskConfigSchemaRequest{}
|
|
|
|
resp, err := d.client.TaskConfigSchema(d.doneCtx, req)
|
|
if err != nil {
|
|
return nil, grpcutils.HandleGrpcErr(err, d.doneCtx)
|
|
}
|
|
|
|
return resp.Spec, nil
|
|
}
|
|
|
|
func (d *driverPluginClient) Capabilities() (*Capabilities, error) {
|
|
req := &proto.CapabilitiesRequest{}
|
|
|
|
resp, err := d.client.Capabilities(d.doneCtx, req)
|
|
if err != nil {
|
|
return nil, grpcutils.HandleGrpcErr(err, d.doneCtx)
|
|
}
|
|
|
|
caps := &Capabilities{}
|
|
if resp.Capabilities != nil {
|
|
caps.SendSignals = resp.Capabilities.SendSignals
|
|
caps.Exec = resp.Capabilities.Exec
|
|
caps.MustInitiateNetwork = resp.Capabilities.MustCreateNetwork
|
|
|
|
for _, mode := range resp.Capabilities.NetworkIsolationModes {
|
|
caps.NetIsolationModes = append(caps.NetIsolationModes, netIsolationModeFromProto(mode))
|
|
}
|
|
|
|
switch resp.Capabilities.FsIsolation {
|
|
case proto.DriverCapabilities_NONE:
|
|
caps.FSIsolation = FSIsolationNone
|
|
case proto.DriverCapabilities_CHROOT:
|
|
caps.FSIsolation = FSIsolationChroot
|
|
case proto.DriverCapabilities_IMAGE:
|
|
caps.FSIsolation = FSIsolationImage
|
|
default:
|
|
caps.FSIsolation = FSIsolationNone
|
|
}
|
|
|
|
caps.MountConfigs = MountConfigSupport(resp.Capabilities.MountConfigs)
|
|
caps.RemoteTasks = resp.Capabilities.RemoteTasks
|
|
caps.DisableLogCollection = resp.Capabilities.DisableLogCollection
|
|
}
|
|
|
|
return caps, nil
|
|
}
|
|
|
|
// Fingerprint the driver, return a chan that will be pushed to periodically and on changes to health
|
|
func (d *driverPluginClient) Fingerprint(ctx context.Context) (<-chan *Fingerprint, error) {
|
|
req := &proto.FingerprintRequest{}
|
|
|
|
// Join the passed context and the shutdown context
|
|
joinedCtx, _ := joincontext.Join(ctx, d.doneCtx)
|
|
|
|
stream, err := d.client.Fingerprint(joinedCtx, req)
|
|
if err != nil {
|
|
return nil, grpcutils.HandleReqCtxGrpcErr(err, ctx, d.doneCtx)
|
|
}
|
|
|
|
ch := make(chan *Fingerprint, 1)
|
|
go d.handleFingerprint(ctx, ch, stream)
|
|
|
|
return ch, nil
|
|
}
|
|
|
|
func (d *driverPluginClient) handleFingerprint(reqCtx context.Context, ch chan *Fingerprint, stream proto.Driver_FingerprintClient) {
|
|
defer close(ch)
|
|
for {
|
|
pb, err := stream.Recv()
|
|
if err != nil {
|
|
if err != io.EOF {
|
|
ch <- &Fingerprint{
|
|
Err: grpcutils.HandleReqCtxGrpcErr(err, reqCtx, d.doneCtx),
|
|
}
|
|
}
|
|
|
|
// End the stream
|
|
return
|
|
}
|
|
|
|
f := &Fingerprint{
|
|
Attributes: pstructs.ConvertProtoAttributeMap(pb.Attributes),
|
|
Health: healthStateFromProto(pb.Health),
|
|
HealthDescription: pb.HealthDescription,
|
|
}
|
|
|
|
select {
|
|
case <-reqCtx.Done():
|
|
return
|
|
case ch <- f:
|
|
}
|
|
}
|
|
}
|
|
|
|
// RecoverTask does internal state recovery to be able to control the task of
|
|
// the given TaskHandle
|
|
func (d *driverPluginClient) RecoverTask(h *TaskHandle) error {
|
|
req := &proto.RecoverTaskRequest{Handle: taskHandleToProto(h)}
|
|
|
|
_, err := d.client.RecoverTask(d.doneCtx, req)
|
|
return grpcutils.HandleGrpcErr(err, d.doneCtx)
|
|
}
|
|
|
|
// StartTask starts execution of a task with the given TaskConfig. A TaskHandle
|
|
// is returned to the caller that can be used to recover state of the task,
|
|
// should the driver crash or exit prematurely.
|
|
func (d *driverPluginClient) StartTask(c *TaskConfig) (*TaskHandle, *DriverNetwork, error) {
|
|
req := &proto.StartTaskRequest{
|
|
Task: taskConfigToProto(c),
|
|
}
|
|
|
|
resp, err := d.client.StartTask(d.doneCtx, req)
|
|
if err != nil {
|
|
st := status.Convert(err)
|
|
if len(st.Details()) > 0 {
|
|
if rec, ok := st.Details()[0].(*sproto.RecoverableError); ok {
|
|
return nil, nil, structs.NewRecoverableError(err, rec.Recoverable)
|
|
}
|
|
}
|
|
return nil, nil, grpcutils.HandleGrpcErr(err, d.doneCtx)
|
|
}
|
|
|
|
var net *DriverNetwork
|
|
if resp.NetworkOverride != nil {
|
|
net = &DriverNetwork{
|
|
PortMap: map[string]int{},
|
|
IP: resp.NetworkOverride.Addr,
|
|
AutoAdvertise: resp.NetworkOverride.AutoAdvertise,
|
|
}
|
|
for k, v := range resp.NetworkOverride.PortMap {
|
|
net.PortMap[k] = int(v)
|
|
}
|
|
}
|
|
|
|
return taskHandleFromProto(resp.Handle), net, nil
|
|
}
|
|
|
|
// WaitTask returns a channel that will have an ExitResult pushed to it once when the task
|
|
// exits on its own or is killed. If WaitTask is called after the task has exited, the channel
|
|
// will immedialy return the ExitResult. WaitTask can be called multiple times for
|
|
// the same task without issue.
|
|
func (d *driverPluginClient) WaitTask(ctx context.Context, id string) (<-chan *ExitResult, error) {
|
|
ch := make(chan *ExitResult)
|
|
go d.handleWaitTask(ctx, id, ch)
|
|
return ch, nil
|
|
}
|
|
|
|
func (d *driverPluginClient) handleWaitTask(ctx context.Context, id string, ch chan *ExitResult) {
|
|
defer close(ch)
|
|
var result ExitResult
|
|
req := &proto.WaitTaskRequest{
|
|
TaskId: id,
|
|
}
|
|
|
|
// Join the passed context and the shutdown context
|
|
joinedCtx, joinedCtxCancel := joincontext.Join(ctx, d.doneCtx)
|
|
defer joinedCtxCancel()
|
|
|
|
resp, err := d.client.WaitTask(joinedCtx, req)
|
|
if err != nil {
|
|
result.Err = grpcutils.HandleReqCtxGrpcErr(err, ctx, d.doneCtx)
|
|
} else {
|
|
result.ExitCode = int(resp.Result.ExitCode)
|
|
result.Signal = int(resp.Result.Signal)
|
|
result.OOMKilled = resp.Result.OomKilled
|
|
if len(resp.Err) > 0 {
|
|
result.Err = errors.New(resp.Err)
|
|
}
|
|
}
|
|
ch <- &result
|
|
}
|
|
|
|
// StopTask stops the task with the given taskID. A timeout and signal can be
|
|
// given to control a graceful termination of the task. The driver will send the
|
|
// given signal to the task and wait for the given timeout for it to exit. If the
|
|
// task does not exit within the timeout it will be forcefully killed.
|
|
func (d *driverPluginClient) StopTask(taskID string, timeout time.Duration, signal string) error {
|
|
req := &proto.StopTaskRequest{
|
|
TaskId: taskID,
|
|
Timeout: ptypes.DurationProto(timeout),
|
|
Signal: signal,
|
|
}
|
|
|
|
_, err := d.client.StopTask(d.doneCtx, req)
|
|
return grpcutils.HandleGrpcErr(err, d.doneCtx)
|
|
}
|
|
|
|
// DestroyTask removes the task from the driver's in memory state. The task
|
|
// cannot be running unless force is set to true. If force is set to true the
|
|
// driver will forcefully terminate the task before removing it.
|
|
func (d *driverPluginClient) DestroyTask(taskID string, force bool) error {
|
|
req := &proto.DestroyTaskRequest{
|
|
TaskId: taskID,
|
|
Force: force,
|
|
}
|
|
|
|
_, err := d.client.DestroyTask(d.doneCtx, req)
|
|
return grpcutils.HandleGrpcErr(err, d.doneCtx)
|
|
}
|
|
|
|
// InspectTask returns status information for a task
|
|
func (d *driverPluginClient) InspectTask(taskID string) (*TaskStatus, error) {
|
|
req := &proto.InspectTaskRequest{TaskId: taskID}
|
|
|
|
resp, err := d.client.InspectTask(d.doneCtx, req)
|
|
if err != nil {
|
|
return nil, grpcutils.HandleGrpcErr(err, d.doneCtx)
|
|
}
|
|
|
|
status, err := taskStatusFromProto(resp.Task)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if resp.Driver != nil {
|
|
status.DriverAttributes = resp.Driver.Attributes
|
|
}
|
|
if resp.NetworkOverride != nil {
|
|
status.NetworkOverride = &DriverNetwork{
|
|
PortMap: map[string]int{},
|
|
IP: resp.NetworkOverride.Addr,
|
|
AutoAdvertise: resp.NetworkOverride.AutoAdvertise,
|
|
}
|
|
for k, v := range resp.NetworkOverride.PortMap {
|
|
status.NetworkOverride.PortMap[k] = int(v)
|
|
}
|
|
}
|
|
|
|
return status, nil
|
|
}
|
|
|
|
// TaskStats returns resource usage statistics for the task
|
|
func (d *driverPluginClient) TaskStats(ctx context.Context, taskID string, interval time.Duration) (<-chan *cstructs.TaskResourceUsage, error) {
|
|
req := &proto.TaskStatsRequest{
|
|
TaskId: taskID,
|
|
CollectionInterval: ptypes.DurationProto(interval),
|
|
}
|
|
ctx, _ = joincontext.Join(ctx, d.doneCtx)
|
|
stream, err := d.client.TaskStats(ctx, req)
|
|
if err != nil {
|
|
st := status.Convert(err)
|
|
if len(st.Details()) > 0 {
|
|
if rec, ok := st.Details()[0].(*sproto.RecoverableError); ok {
|
|
return nil, structs.NewRecoverableError(err, rec.Recoverable)
|
|
}
|
|
}
|
|
return nil, grpcutils.HandleGrpcErr(err, d.doneCtx)
|
|
}
|
|
|
|
ch := make(chan *cstructs.TaskResourceUsage, 1)
|
|
go d.handleStats(ctx, ch, stream)
|
|
|
|
return ch, nil
|
|
}
|
|
|
|
func (d *driverPluginClient) handleStats(ctx context.Context, ch chan<- *cstructs.TaskResourceUsage, stream proto.Driver_TaskStatsClient) {
|
|
defer close(ch)
|
|
for {
|
|
resp, err := stream.Recv()
|
|
if ctx.Err() != nil {
|
|
// Context canceled; exit gracefully
|
|
return
|
|
}
|
|
|
|
if err != nil {
|
|
if err != io.EOF {
|
|
d.logger.Error("error receiving stream from TaskStats driver RPC, closing stream", "error", err)
|
|
}
|
|
|
|
// End of stream
|
|
return
|
|
}
|
|
|
|
stats, err := TaskStatsFromProto(resp.Stats)
|
|
if err != nil {
|
|
d.logger.Error("failed to decode stats from RPC", "error", err, "stats", resp.Stats)
|
|
continue
|
|
}
|
|
|
|
select {
|
|
case ch <- stats:
|
|
case <-ctx.Done():
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// TaskEvents returns a channel that will receive events from the driver about all
|
|
// tasks such as lifecycle events, terminal errors, etc.
|
|
func (d *driverPluginClient) TaskEvents(ctx context.Context) (<-chan *TaskEvent, error) {
|
|
req := &proto.TaskEventsRequest{}
|
|
|
|
// Join the passed context and the shutdown context
|
|
joinedCtx, _ := joincontext.Join(ctx, d.doneCtx)
|
|
|
|
stream, err := d.client.TaskEvents(joinedCtx, req)
|
|
if err != nil {
|
|
return nil, grpcutils.HandleReqCtxGrpcErr(err, ctx, d.doneCtx)
|
|
}
|
|
|
|
ch := make(chan *TaskEvent, 1)
|
|
go d.handleTaskEvents(ctx, ch, stream)
|
|
return ch, nil
|
|
}
|
|
|
|
func (d *driverPluginClient) handleTaskEvents(reqCtx context.Context, ch chan *TaskEvent, stream proto.Driver_TaskEventsClient) {
|
|
defer close(ch)
|
|
for {
|
|
ev, err := stream.Recv()
|
|
if err != nil {
|
|
if err != io.EOF {
|
|
ch <- &TaskEvent{
|
|
Err: grpcutils.HandleReqCtxGrpcErr(err, reqCtx, d.doneCtx),
|
|
}
|
|
}
|
|
|
|
// End the stream
|
|
return
|
|
}
|
|
|
|
timestamp, _ := ptypes.Timestamp(ev.Timestamp)
|
|
event := &TaskEvent{
|
|
TaskID: ev.TaskId,
|
|
AllocID: ev.AllocId,
|
|
TaskName: ev.TaskName,
|
|
Annotations: ev.Annotations,
|
|
Message: ev.Message,
|
|
Timestamp: timestamp,
|
|
}
|
|
select {
|
|
case <-reqCtx.Done():
|
|
return
|
|
case ch <- event:
|
|
}
|
|
}
|
|
}
|
|
|
|
// SignalTask will send the given signal to the specified task
|
|
func (d *driverPluginClient) SignalTask(taskID string, signal string) error {
|
|
req := &proto.SignalTaskRequest{
|
|
TaskId: taskID,
|
|
Signal: signal,
|
|
}
|
|
_, err := d.client.SignalTask(d.doneCtx, req)
|
|
return grpcutils.HandleGrpcErr(err, d.doneCtx)
|
|
}
|
|
|
|
// ExecTask will run the given command within the execution context of the task.
|
|
// The driver will wait for the given timeout for the command to complete before
|
|
// terminating it. The stdout and stderr of the command will be return to the caller,
|
|
// along with other exit information such as exit code.
|
|
func (d *driverPluginClient) ExecTask(taskID string, cmd []string, timeout time.Duration) (*ExecTaskResult, error) {
|
|
req := &proto.ExecTaskRequest{
|
|
TaskId: taskID,
|
|
Command: cmd,
|
|
Timeout: ptypes.DurationProto(timeout),
|
|
}
|
|
|
|
resp, err := d.client.ExecTask(d.doneCtx, req)
|
|
if err != nil {
|
|
return nil, grpcutils.HandleGrpcErr(err, d.doneCtx)
|
|
}
|
|
|
|
result := &ExecTaskResult{
|
|
Stdout: resp.Stdout,
|
|
Stderr: resp.Stderr,
|
|
ExitResult: exitResultFromProto(resp.Result),
|
|
}
|
|
|
|
return result, nil
|
|
}
|
|
|
|
var _ ExecTaskStreamingRawDriver = (*driverPluginClient)(nil)
|
|
|
|
func (d *driverPluginClient) ExecTaskStreamingRaw(ctx context.Context,
|
|
taskID string,
|
|
command []string,
|
|
tty bool,
|
|
execStream ExecTaskStream) error {
|
|
|
|
stream, err := d.client.ExecTaskStreaming(ctx)
|
|
if err != nil {
|
|
return grpcutils.HandleGrpcErr(err, d.doneCtx)
|
|
}
|
|
|
|
err = stream.Send(&proto.ExecTaskStreamingRequest{
|
|
Setup: &proto.ExecTaskStreamingRequest_Setup{
|
|
TaskId: taskID,
|
|
Command: command,
|
|
Tty: tty,
|
|
},
|
|
})
|
|
if err != nil {
|
|
return grpcutils.HandleGrpcErr(err, d.doneCtx)
|
|
}
|
|
|
|
errCh := make(chan error, 1)
|
|
|
|
go func() {
|
|
for {
|
|
m, err := execStream.Recv()
|
|
if err == io.EOF {
|
|
return
|
|
} else if err != nil {
|
|
errCh <- err
|
|
return
|
|
}
|
|
|
|
if err := stream.Send(m); err != nil {
|
|
errCh <- err
|
|
return
|
|
}
|
|
|
|
}
|
|
}()
|
|
|
|
for {
|
|
select {
|
|
case err := <-errCh:
|
|
return err
|
|
default:
|
|
}
|
|
|
|
m, err := stream.Recv()
|
|
if err == io.EOF {
|
|
// Once we get to the end of stream successfully, we can ignore errCh:
|
|
// e.g. input write failures after process terminates shouldn't cause method to fail
|
|
return nil
|
|
} else if err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := execStream.Send(m); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
|
|
var _ DriverNetworkManager = (*driverPluginClient)(nil)
|
|
|
|
func (d *driverPluginClient) CreateNetwork(allocID string, _ *NetworkCreateRequest) (*NetworkIsolationSpec, bool, error) {
|
|
req := &proto.CreateNetworkRequest{
|
|
AllocId: allocID,
|
|
}
|
|
|
|
resp, err := d.client.CreateNetwork(d.doneCtx, req)
|
|
if err != nil {
|
|
return nil, false, grpcutils.HandleGrpcErr(err, d.doneCtx)
|
|
}
|
|
|
|
return NetworkIsolationSpecFromProto(resp.IsolationSpec), resp.Created, nil
|
|
}
|
|
|
|
func (d *driverPluginClient) DestroyNetwork(allocID string, spec *NetworkIsolationSpec) error {
|
|
if spec == nil {
|
|
return nil
|
|
}
|
|
|
|
req := &proto.DestroyNetworkRequest{
|
|
AllocId: allocID,
|
|
IsolationSpec: NetworkIsolationSpecToProto(spec),
|
|
}
|
|
|
|
_, err := d.client.DestroyNetwork(d.doneCtx, req)
|
|
if err != nil {
|
|
return grpcutils.HandleGrpcErr(err, d.doneCtx)
|
|
}
|
|
|
|
return nil
|
|
}
|