Merge pull request #5577 from hashicorp/dani/b-logmon-unrecoverable
logging: Attempt to recover logmon failures
This commit is contained in:
commit
f515b93b5e
|
@ -95,22 +95,29 @@ func reattachConfigFromHookData(data map[string]string) (*plugin.ReattachConfig,
|
|||
func (h *logmonHook) Prestart(ctx context.Context,
|
||||
req *interfaces.TaskPrestartRequest, resp *interfaces.TaskPrestartResponse) error {
|
||||
|
||||
// Create a logmon client by reattaching or launching a new instance
|
||||
if h.logmonPluginClient == nil || h.logmonPluginClient.Exited() {
|
||||
// Attempt to reattach to logmon
|
||||
if h.logmonPluginClient == nil {
|
||||
reattachConfig, err := reattachConfigFromHookData(req.PreviousState)
|
||||
if err != nil {
|
||||
h.logger.Error("failed to load reattach config", "error", err)
|
||||
return err
|
||||
}
|
||||
if reattachConfig != nil {
|
||||
if err := h.launchLogMon(reattachConfig); err != nil {
|
||||
h.logger.Warn("failed to reattach to logmon process", "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Launch or reattach logmon instance for the task.
|
||||
if err := h.launchLogMon(reattachConfig); err != nil {
|
||||
// Retry errors launching logmon as logmon may have crashed and
|
||||
}
|
||||
|
||||
// We did not reattach to a plugin and one is still not running.
|
||||
if h.logmonPluginClient == nil || h.logmonPluginClient.Exited() {
|
||||
if err := h.launchLogMon(nil); err != nil {
|
||||
// Retry errors launching logmon as logmon may have crashed on start and
|
||||
// subsequent attempts will start a new one.
|
||||
h.logger.Error("failed to launch logmon process", "error", err)
|
||||
return structs.NewRecoverableError(err, true)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
err := h.logmon.Start(&logmon.LogConfig{
|
||||
|
|
|
@ -14,14 +14,13 @@ import (
|
|||
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
|
||||
"github.com/hashicorp/nomad/helper/testlog"
|
||||
"github.com/hashicorp/nomad/nomad/mock"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
"github.com/hashicorp/nomad/testutil"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// TestTaskRunner_LogmonHook_StartCrashStop simulates logmon crashing while the
|
||||
// Nomad client is restarting and asserts failing to reattach to logmon causes
|
||||
// a recoverable error (task restart).
|
||||
// nomad to spawn a new logmon.
|
||||
func TestTaskRunner_LogmonHook_StartCrashStop(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
@ -46,6 +45,7 @@ func TestTaskRunner_LogmonHook_StartCrashStop(t *testing.T) {
|
|||
require.NoError(t, hook.Prestart(context.Background(), &req, &resp))
|
||||
defer hook.Stop(context.Background(), nil, nil)
|
||||
|
||||
origState := resp.State
|
||||
origHookData := resp.State[logmonReattachKey]
|
||||
require.NotEmpty(t, origHookData)
|
||||
|
||||
|
@ -80,9 +80,8 @@ func TestTaskRunner_LogmonHook_StartCrashStop(t *testing.T) {
|
|||
}
|
||||
resp = interfaces.TaskPrestartResponse{}
|
||||
err = hook.Prestart(context.Background(), &req, &resp)
|
||||
require.Error(t, err)
|
||||
require.True(t, structs.IsRecoverable(err))
|
||||
require.Empty(t, resp.State)
|
||||
require.NoError(t, err)
|
||||
require.NotEqual(t, origState, resp.State)
|
||||
|
||||
// Running stop should shutdown logmon
|
||||
require.NoError(t, hook.Stop(context.Background(), nil, nil))
|
||||
|
|
Loading…
Reference in New Issue