Merge pull request #5577 from hashicorp/dani/b-logmon-unrecoverable

logging: Attempt to recover logmon failures
This commit is contained in:
Mahmood Ali 2019-04-22 14:40:24 -04:00 committed by GitHub
commit f515b93b5e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 17 additions and 11 deletions

View File

@ -95,22 +95,29 @@ func reattachConfigFromHookData(data map[string]string) (*plugin.ReattachConfig,
func (h *logmonHook) Prestart(ctx context.Context,
req *interfaces.TaskPrestartRequest, resp *interfaces.TaskPrestartResponse) error {
// Create a logmon client by reattaching or launching a new instance
if h.logmonPluginClient == nil || h.logmonPluginClient.Exited() {
// Attempt to reattach to logmon
if h.logmonPluginClient == nil {
reattachConfig, err := reattachConfigFromHookData(req.PreviousState)
if err != nil {
h.logger.Error("failed to load reattach config", "error", err)
return err
}
if reattachConfig != nil {
if err := h.launchLogMon(reattachConfig); err != nil {
h.logger.Warn("failed to reattach to logmon process", "error", err)
}
}
// Launch or reattach logmon instance for the task.
if err := h.launchLogMon(reattachConfig); err != nil {
// Retry errors launching logmon as logmon may have crashed and
}
// We did not reattach to a plugin and one is still not running.
if h.logmonPluginClient == nil || h.logmonPluginClient.Exited() {
if err := h.launchLogMon(nil); err != nil {
// Retry errors launching logmon as logmon may have crashed on start and
// subsequent attempts will start a new one.
h.logger.Error("failed to launch logmon process", "error", err)
return structs.NewRecoverableError(err, true)
}
}
err := h.logmon.Start(&logmon.LogConfig{

View File

@ -14,14 +14,13 @@ import (
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
"github.com/hashicorp/nomad/helper/testlog"
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/testutil"
"github.com/stretchr/testify/require"
)
// TestTaskRunner_LogmonHook_StartCrashStop simulates logmon crashing while the
// Nomad client is restarting and asserts failing to reattach to logmon causes
// a recoverable error (task restart).
// nomad to spawn a new logmon.
func TestTaskRunner_LogmonHook_StartCrashStop(t *testing.T) {
t.Parallel()
@ -46,6 +45,7 @@ func TestTaskRunner_LogmonHook_StartCrashStop(t *testing.T) {
require.NoError(t, hook.Prestart(context.Background(), &req, &resp))
defer hook.Stop(context.Background(), nil, nil)
origState := resp.State
origHookData := resp.State[logmonReattachKey]
require.NotEmpty(t, origHookData)
@ -80,9 +80,8 @@ func TestTaskRunner_LogmonHook_StartCrashStop(t *testing.T) {
}
resp = interfaces.TaskPrestartResponse{}
err = hook.Prestart(context.Background(), &req, &resp)
require.Error(t, err)
require.True(t, structs.IsRecoverable(err))
require.Empty(t, resp.State)
require.NoError(t, err)
require.NotEqual(t, origState, resp.State)
// Running stop should shutdown logmon
require.NoError(t, hook.Stop(context.Background(), nil, nil))