diff --git a/.changelog/15469.txt b/.changelog/15469.txt
new file mode 100644
index 000000000..027e5b2fa
--- /dev/null
+++ b/.changelog/15469.txt
@@ -0,0 +1,3 @@
+```release-note:improvement
+cli: add a nomad operator client state command
+```
diff --git a/command/commands.go b/command/commands.go
index 029293e43..8ffa7d125 100644
--- a/command/commands.go
+++ b/command/commands.go
@@ -614,6 +614,12 @@ func Commands(metaPtr *Meta, agentUi cli.Ui) map[string]cli.CommandFactory {
 				Meta: meta,
 			}, nil
 		},
+
+		"operator client-state": func() (cli.Command, error) {
+			return &OperatorClientStateCommand{
+				Meta: meta,
+			}, nil
+		},
 		"operator debug": func() (cli.Command, error) {
 			return &OperatorDebugCommand{
 				Meta: meta,
diff --git a/command/operator_client_state.go b/command/operator_client_state.go
new file mode 100644
index 000000000..ce8da4e8e
--- /dev/null
+++ b/command/operator_client_state.go
@@ -0,0 +1,161 @@
+package command
+
+import (
+	"encoding/json"
+	"fmt"
+	"strings"
+
+	"github.com/hashicorp/go-hclog"
+	trstate "github.com/hashicorp/nomad/client/allocrunner/taskrunner/state"
+	"github.com/hashicorp/nomad/client/state"
+	"github.com/posener/complete"
+)
+
+// OperatorClientStateCommand implements "nomad operator client-state", which
+// prints the client state stored in a local state directory as JSON.
+type OperatorClientStateCommand struct {
+	Meta
+}
+
+// Help returns the long-form usage text for the command.
+func (c *OperatorClientStateCommand) Help() string {
+	helpText := `
+Usage: nomad operator client-state <client_state_dir>
+
+  Emits a representation of the stored client state in JSON format.
+`
+	return strings.TrimSpace(helpText)
+}
+
+// AutocompleteFlags returns an empty flag set; the command defines no flags.
+func (c *OperatorClientStateCommand) AutocompleteFlags() complete.Flags {
+	return complete.Flags{}
+}
+
+// AutocompleteArgs returns a predictor that offers no completions.
+func (c *OperatorClientStateCommand) AutocompleteArgs() complete.Predictor {
+	return complete.PredictNothing
+}
+
+// Synopsis returns the one-line summary of the command.
+func (c *OperatorClientStateCommand) Synopsis() string {
+	return "Dump the nomad client state"
+}
+
+// Name returns the name under which the command is registered.
+func (c *OperatorClientStateCommand) Name() string { return "operator client-state" }
+
+// Run opens the client state database in the directory named by the single
+// argument, gathers every stored allocation with its deployment status and
+// task runner state, and writes the result to the UI as one JSON document.
+// It returns 0 on success and 1 on a usage error or any read failure.
+func (c *OperatorClientStateCommand) Run(args []string) int {
+	if len(args) != 1 {
+		c.Ui.Error("This command takes one argument: <client_state_dir>")
+		c.Ui.Error(commandErrorText(c))
+
+		return 1
+	}
+
+	logger := hclog.L()
+	db, err := state.NewBoltStateDB(logger, args[0])
+	if err != nil {
+		c.Ui.Error(fmt.Sprintf("failed to open client state: %v", err))
+		return 1
+	}
+	defer db.Close()
+
+	allocs, _, err := db.GetAllAllocations()
+	if err != nil {
+		c.Ui.Error(fmt.Sprintf("failed to get allocations: %v", err))
+		return 1
+	}
+
+	data := map[string]*clientStateAlloc{}
+	for _, alloc := range allocs {
+		allocID := alloc.ID
+		deployState, err := db.GetDeploymentStatus(allocID)
+		if err != nil {
+			c.Ui.Error(fmt.Sprintf("failed to get deployment status for %s: %v", allocID, err))
+			return 1
+		}
+
+		// Collect the task names up front. A missing job or an unresolved
+		// task group would otherwise be dereferenced through nil; such an
+		// allocation is emitted with an empty task map.
+		var taskNames []string
+		if alloc.Job != nil {
+			if tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup); tg != nil {
+				for _, jt := range tg.Tasks {
+					taskNames = append(taskNames, jt.Name)
+				}
+			}
+		}
+
+		tasks := map[string]*taskState{}
+		for _, taskName := range taskNames {
+			ls, rs, err := db.GetTaskRunnerState(allocID, taskName)
+			if err != nil {
+				c.Ui.Error(fmt.Sprintf("failed to get task runner state %s: %v", allocID, err))
+				return 1
+			}
+
+			// Skip tasks whose local state or driver handle is absent;
+			// there is no driver state to decode for them.
+			if ls == nil || ls.TaskHandle == nil {
+				continue
+			}
+
+			var ds any
+			if err := ls.TaskHandle.GetDriverState(&ds); err != nil {
+				c.Ui.Error(fmt.Sprintf("failed to parse driver state %s: %v", allocID, err))
+				return 1
+			}
+
+			tasks[taskName] = &taskState{
+				LocalState:  ls,
+				RemoteState: rs,
+				DriverState: ds,
+			}
+		}
+
+		data[allocID] = &clientStateAlloc{
+			Alloc:        alloc,
+			DeployStatus: deployState,
+			Tasks:        tasks,
+		}
+	}
+
+	output := debugOutput{
+		Allocations: data,
+	}
+	bytes, err := json.Marshal(output)
+	if err != nil {
+		c.Ui.Error(fmt.Sprintf("failed to serialize client state: %v", err))
+		return 1
+	}
+	c.Ui.Output(string(bytes))
+
+	return 0
+}
+
+// debugOutput is the top-level JSON document emitted by the command.
+type debugOutput struct {
+	Allocations map[string]*clientStateAlloc
+}
+
+// clientStateAlloc groups one allocation with its stored deployment status
+// and the runner state of its tasks, keyed by task name.
+type clientStateAlloc struct {
+	Alloc        any
+	DeployStatus any
+	Tasks        map[string]*taskState
+}
+
+// taskState holds the stored local and remote task runner state of one task
+// together with its decoded driver state.
+type taskState struct {
+	LocalState  *trstate.LocalState
+	RemoteState any
+	DriverState any
+}
diff --git a/command/operator_client_state_test.go b/command/operator_client_state_test.go
new file mode 100644
index 000000000..3ae55a251
--- /dev/null
+++ b/command/operator_client_state_test.go
@@ -0,0 +1,29 @@
+package command
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/hashicorp/nomad/ci"
+	"github.com/mitchellh/cli"
+	"github.com/stretchr/testify/require"
+)
+
+func TestOperatorClientStateCommand(t *testing.T) {
+	ci.Parallel(t)
+	ui := cli.NewMockUi()
+	cmd := &OperatorClientStateCommand{Meta: Meta{Ui: ui}}
+
+	failedCode := cmd.Run([]string{"some", "bad", "args"})
+	require.Equal(t, 1, failedCode)
+	if out := ui.ErrorWriter.String(); !strings.Contains(out, commandErrorText(cmd)) {
+		t.Fatalf("expected help output, got: %s", out)
+	}
+	ui.ErrorWriter.Reset()
+
+	dir := t.TempDir()
+	code := cmd.Run([]string{dir})
+
+	require.Equal(t, 0, code)
+	require.Contains(t, ui.OutputWriter.String(), "{}")
+}
diff --git a/website/content/docs/commands/operator/client-state.mdx b/website/content/docs/commands/operator/client-state.mdx
new file mode 100644
index 000000000..e654e208b
--- /dev/null
+++ b/website/content/docs/commands/operator/client-state.mdx
@@ -0,0 +1,679 @@
+---
+layout: docs
+page_title: 'Commands: operator client-state'
+description: >
+  The `operator client-state` command generates a representation of the
+  stored client state in JSON format.
+--- + +# Command: operator client-state + +The `operator client-state` command generates a representation of the +stored client state in JSON format. + +## Usage + +```plaintext +nomad operator client-state <client_state_dir> +``` + +## Example + +The output of this command can be piped to `jq` for further filtering and analysis: + +```shell-session +$ nomad operator client-state /path/to/client_state_dir | jq +{ + "Allocations": { + "3b0ed734-f721-45d3-420a-3d96926b3f1d": { + "Alloc": { + "ID": "3b0ed734-f721-45d3-420a-3d96926b3f1d", + "Namespace": "default", + "EvalID": "042fbfeb-0c75-e696-b9b8-e3b4328a4988", + "Name": "docs.example[0]", + "NodeID": "00d48d89-d512-3ee6-4b95-271b72415916", + "NodeName": "YOUR_NOMAD_NAME_HERE", + "JobID": "docs", + "Job": { + "Stop": false, + "Region": "global", + "Namespace": "default", + "ID": "docs", + "ParentID": "", + "Name": "docs", + "Type": "service", + "Priority": 50, + "AllAtOnce": false, + "Datacenters": [ + "dc1" + ], + "Constraints": null, + "Affinities": null, + "Spreads": null, + "TaskGroups": [ + { + "Name": "example", + "Count": 1, + "Update": { + "Stagger": 30000000000, + "MaxParallel": 1, + "HealthCheck": "checks", + "MinHealthyTime": 10000000000, + "HealthyDeadline": 300000000000, + "ProgressDeadline": 600000000000, + "AutoRevert": false, + "AutoPromote": false, + "Canary": 0 + }, + "Migrate": { + "MaxParallel": 1, + "HealthCheck": "checks", + "MinHealthyTime": 10000000000, + "HealthyDeadline": 300000000000 + }, + "Constraints": null, + "Scaling": null, + "RestartPolicy": { + "Attempts": 2, + "Interval": 1800000000000, + "Delay": 15000000000, + "Mode": "fail" + }, + "Tasks": [ + { + "Name": "server", + "Driver": "docker", + "User": "", + "Config": { + "args": [ + "-listen", + ":5678", + "-text", + "hello world" + ], + "image": "hashicorp/http-echo", + "ports": [ + "http" + ] + }, + "Env": null, + "Services": null, + "Vault": null, + "Templates": null, + "Constraints": null, + "Affinities": null, + "Resources": { + "CPU": 100, + "Cores": 0, + "MemoryMB": 
300, + "MemoryMaxMB": 0, + "DiskMB": 0, + "IOPS": 0, + "Networks": null, + "Devices": null + }, + "RestartPolicy": { + "Attempts": 2, + "Interval": 1800000000000, + "Delay": 15000000000, + "Mode": "fail" + }, + "DispatchPayload": null, + "Lifecycle": null, + "Meta": null, + "KillTimeout": 5000000000, + "LogConfig": { + "MaxFiles": 10, + "MaxFileSizeMB": 10 + }, + "Artifacts": null, + "Leader": false, + "ShutdownDelay": 0, + "VolumeMounts": null, + "ScalingPolicies": null, + "KillSignal": "", + "Kind": "", + "CSIPluginConfig": null + } + ], + "EphemeralDisk": { + "Sticky": false, + "SizeMB": 300, + "Migrate": false + }, + "Meta": null, + "ReschedulePolicy": { + "Attempts": 0, + "Interval": 0, + "Delay": 30000000000, + "DelayFunction": "exponential", + "MaxDelay": 3600000000000, + "Unlimited": true + }, + "Affinities": null, + "Spreads": null, + "Networks": [ + { + "Mode": "", + "Device": "", + "CIDR": "", + "IP": "", + "MBits": 0, + "DNS": null, + "ReservedPorts": [ + { + "Label": "http", + "Value": 5678, + "To": 0, + "HostNetwork": "default" + } + ], + "DynamicPorts": null + } + ], + "Consul": { + "Namespace": "" + }, + "Services": null, + "Volumes": null, + "ShutdownDelay": null, + "StopAfterClientDisconnect": null, + "MaxClientDisconnect": null + } + ], + "Update": { + "Stagger": 30000000000, + "MaxParallel": 1, + "HealthCheck": "", + "MinHealthyTime": 0, + "HealthyDeadline": 0, + "ProgressDeadline": 0, + "AutoRevert": false, + "AutoPromote": false, + "Canary": 0 + }, + "Multiregion": null, + "Periodic": null, + "ParameterizedJob": null, + "Dispatched": false, + "DispatchIdempotencyToken": "", + "Payload": null, + "Meta": null, + "ConsulToken": "", + "ConsulNamespace": "", + "VaultToken": "", + "VaultNamespace": "", + "NomadTokenID": "", + "Status": "pending", + "StatusDescription": "", + "Stable": false, + "Version": 0, + "SubmitTime": 1670925631564348000, + "CreateIndex": 14, + "ModifyIndex": 14, + "JobModifyIndex": 14 + }, + "TaskGroup": "example", + 
"Resources": { + "CPU": 100, + "Cores": 0, + "MemoryMB": 300, + "MemoryMaxMB": 300, + "DiskMB": 300, + "IOPS": 0, + "Networks": [ + { + "Mode": "", + "Device": "", + "CIDR": "", + "IP": "192.168.1.9", + "MBits": 0, + "DNS": null, + "ReservedPorts": [ + { + "Label": "http", + "Value": 5678, + "To": 0, + "HostNetwork": "default" + } + ], + "DynamicPorts": null + } + ], + "Devices": null + }, + "SharedResources": { + "CPU": 0, + "Cores": 0, + "MemoryMB": 0, + "MemoryMaxMB": 0, + "DiskMB": 300, + "IOPS": 0, + "Networks": [ + { + "Mode": "", + "Device": "", + "CIDR": "", + "IP": "192.168.1.9", + "MBits": 0, + "DNS": null, + "ReservedPorts": [ + { + "Label": "http", + "Value": 5678, + "To": 0, + "HostNetwork": "default" + } + ], + "DynamicPorts": null + } + ], + "Devices": null + }, + "TaskResources": { + "server": { + "CPU": 100, + "Cores": 0, + "MemoryMB": 300, + "MemoryMaxMB": 0, + "DiskMB": 0, + "IOPS": 0, + "Networks": null, + "Devices": null + } + }, + "AllocatedResources": { + "Tasks": { + "server": { + "Cpu": { + "CpuShares": 100, + "ReservedCores": null + }, + "Memory": { + "MemoryMB": 300, + "MemoryMaxMB": 0 + }, + "Networks": null, + "Devices": null + } + }, + "TaskLifecycles": { + "server": null + }, + "Shared": { + "Networks": [ + { + "Mode": "", + "Device": "", + "CIDR": "", + "IP": "192.168.1.9", + "MBits": 0, + "DNS": null, + "ReservedPorts": [ + { + "Label": "http", + "Value": 5678, + "To": 0, + "HostNetwork": "default" + } + ], + "DynamicPorts": null + } + ], + "DiskMB": 300, + "Ports": [ + { + "Label": "http", + "Value": 5678, + "To": 0, + "HostIP": "192.168.1.9" + } + ] + } + }, + "Metrics": { + "NodesEvaluated": 1, + "NodesFiltered": 0, + "NodesAvailable": { + "dc1": 1 + }, + "ClassFiltered": null, + "ConstraintFiltered": null, + "NodesExhausted": 0, + "ClassExhausted": null, + "DimensionExhausted": null, + "QuotaExhausted": null, + "ResourcesExhausted": null, + "Scores": null, + "ScoreMetaData": [ + { + "NodeID": 
"00d48d89-d512-3ee6-4b95-271b72415916", + "Scores": { + "binpack": 0.014787748194725047, + "job-anti-affinity": 0, + "node-affinity": 0, + "node-reschedule-penalty": 0 + }, + "NormScore": 0.014787748194725047 + } + ], + "AllocationTime": 64877, + "CoalescedFailures": 0 + }, + "DesiredStatus": "run", + "DesiredDescription": "", + "DesiredTransition": { + "Migrate": null, + "Reschedule": null, + "ForceReschedule": null, + "NoShutdownDelay": null + }, + "ClientStatus": "pending", + "ClientDescription": "", + "TaskStates": null, + "AllocStates": null, + "PreviousAllocation": "", + "NextAllocation": "", + "DeploymentID": "dc4c0c22-3bc7-a17a-5d7b-bce06a692293", + "DeploymentStatus": null, + "RescheduleTracker": null, + "NetworkStatus": null, + "FollowupEvalID": "", + "PreemptedAllocations": null, + "PreemptedByAllocation": "", + "SigningKeyID": "e596c865-adad-78cc-0266-94640594e5a2", + "CreateIndex": 15, + "ModifyIndex": 15, + "AllocModifyIndex": 15, + "CreateTime": 1670925631566156300, + "ModifyTime": 1670925631566156300 + }, + "DeployStatus": { + "Healthy": true, + "Timestamp": "2022-12-13T18:00:45.351354859+08:00", + "Canary": false, + "ModifyIndex": 0 + }, + "Tasks": { + "server": { + "LocalState": { + "Hooks": { + "artifacts": { + "PrestartDone": true, + "Data": null, + "Env": null + }, + "devices": { + "PrestartDone": true, + "Data": null, + "Env": null + }, + "dispatch_payload": { + "PrestartDone": true, + "Data": null, + "Env": null + }, + "identity": { + "PrestartDone": false, + "Data": null, + "Env": null + }, + "logmon": { + "PrestartDone": false, + "Data": { + "reattach_config": "{\"Protocol\":\"grpc\",\"Network\":\"unix\",\"Addr\":\"/tmp/plugin2391577039\",\"Pid\":97576}" + }, + "Env": null + }, + "script_checks": { + "PrestartDone": false, + "Data": null, + "Env": null + }, + "task_dir": { + "PrestartDone": false, + "Data": { + "is_done": "true" + }, + "Env": null + }, + "validate": { + "PrestartDone": true, + "Data": null, + "Env": null + }, + "volumes": { 
+ "PrestartDone": false, + "Data": null, + "Env": null + } + }, + "DriverNetwork": { + "PortMap": null, + "IP": "172.17.0.2", + "AutoAdvertise": false + }, + "TaskHandle": { + "Version": 1, + "Config": { + "ID": "3b0ed734-f721-45d3-420a-3d96926b3f1d/server/c7ffd87f", + "JobName": "docs", + "JobID": "docs", + "TaskGroupName": "example", + "Name": "server", + "Namespace": "default", + "NodeName": "YOUR_NOMAD_NAME_HERE", + "NodeID": "00d48d89-d512-3ee6-4b95-271b72415916", + "Env": { + "NOMAD_ADDR_http": "192.168.1.9:5678", + "NOMAD_ALLOC_DIR": "/alloc", + "NOMAD_ALLOC_ID": "3b0ed734-f721-45d3-420a-3d96926b3f1d", + "NOMAD_ALLOC_INDEX": "0", + "NOMAD_ALLOC_NAME": "docs.example[0]", + "NOMAD_ALLOC_PORT_http": "5678", + "NOMAD_CPU_LIMIT": "100", + "NOMAD_DC": "dc1", + "NOMAD_GROUP_NAME": "example", + "NOMAD_HOST_ADDR_http": "192.168.1.9:5678", + "NOMAD_HOST_IP_http": "192.168.1.9", + "NOMAD_HOST_PORT_http": "5678", + "NOMAD_IP_http": "192.168.1.9", + "NOMAD_JOB_ID": "docs", + "NOMAD_JOB_NAME": "docs", + "NOMAD_MEMORY_LIMIT": "300", + "NOMAD_NAMESPACE": "default", + "NOMAD_PARENT_CGROUP": "/nomad", + "NOMAD_PORT_http": "5678", + "NOMAD_REGION": "global", + "NOMAD_SECRETS_DIR": "/secrets", + "NOMAD_SHORT_ALLOC_ID": "3b0ed734", + "NOMAD_TASK_DIR": "/local", + "NOMAD_TASK_NAME": "server" + }, + "DeviceEnv": {}, + "Resources": { + "NomadResources": { + "Cpu": { + "CpuShares": 100, + "ReservedCores": null + }, + "Memory": { + "MemoryMB": 300, + "MemoryMaxMB": 0 + }, + "Networks": null, + "Devices": null + }, + "LinuxResources": { + "CPUPeriod": 0, + "CPUQuota": 0, + "CPUShares": 100, + "MemoryLimitBytes": 314572800, + "OOMScoreAdj": 0, + "CpusetCpus": "", + "CpusetCgroupPath": "", + "PercentTicks": 0.0023148148148148147 + }, + "Ports": [ + { + "Label": "http", + "Value": 5678, + "To": 0, + "HostIP": "192.168.1.9" + } + ] + }, + "Devices": null, + "Mounts": null, + "User": "", + "AllocDir": "/home/test/alloc/3b0ed734-f721-45d3-420a-3d96926b3f1d", + "StdoutPath": 
"/home/test/alloc/3b0ed734-f721-45d3-420a-3d96926b3f1d/alloc/logs/.server.stdout.fifo", + "StderrPath": "/home/test/alloc/3b0ed734-f721-45d3-420a-3d96926b3f1d/alloc/logs/.server.stderr.fifo", + "AllocID": "3b0ed734-f721-45d3-420a-3d96926b3f1d", + "NetworkIsolation": null, + "DNS": null + }, + "State": "", + "DriverState": "g6tDb250YWluZXJJRNoAQDdkNmQxMGVjZTY1YmQ2ZjY0MDk1YzdiMWI2NjViMzUyOTI1NDc5NmQ0YzE3ODY2YzdlNmE1ZTE2YWQ0NzI3YzitRHJpdmVyTmV0d29ya4OtQXV0b0FkdmVydGlzZcKiSVCqMTcyLjE3LjAuMqdQb3J0TWFwwK5SZWF0dGFjaENvbmZpZ4SkQWRkcrUvdG1wL3BsdWdpbjE4Mjc2MjczOTCnTmV0d29ya6R1bml4o1BpZNIAAX2qqFByb3RvY29spGdycGM=" + }, + "RunComplete": false + }, + "RemoteState": { + "State": "running", + "Failed": false, + "Restarts": 0, + "LastRestart": "0001-01-01T00:00:00Z", + "StartedAt": "2022-12-13T10:00:35.350231151Z", + "FinishedAt": "0001-01-01T00:00:00Z", + "Events": [ + { + "Type": "Received", + "Time": 1670925631571274800, + "Message": "", + "DisplayMessage": "Task received by client", + "Details": {}, + "FailsTask": false, + "RestartReason": "", + "SetupError": "", + "DriverError": "", + "ExitCode": 0, + "Signal": 0, + "KillTimeout": 0, + "KillError": "", + "KillReason": "", + "StartDelay": 0, + "DownloadError": "", + "ValidationError": "", + "DiskLimit": 0, + "FailedSibling": "", + "VaultError": "", + "TaskSignalReason": "", + "TaskSignal": "", + "DriverMessage": "", + "GenericSource": "" + }, + { + "Type": "Task Setup", + "Time": 1670925631572548900, + "Message": "Building Task Directory", + "DisplayMessage": "Building Task Directory", + "Details": { + "message": "Building Task Directory" + }, + "FailsTask": false, + "RestartReason": "", + "SetupError": "", + "DriverError": "", + "ExitCode": 0, + "Signal": 0, + "KillTimeout": 0, + "KillError": "", + "KillReason": "", + "StartDelay": 0, + "DownloadError": "", + "ValidationError": "", + "DiskLimit": 0, + "FailedSibling": "", + "VaultError": "", + "TaskSignalReason": "", + "TaskSignal": "", + "DriverMessage": "", + 
"GenericSource": "" + }, + { + "Type": "Driver", + "Time": 1670925631589523700, + "Message": "", + "DisplayMessage": "Downloading image", + "Details": { + "image": "hashicorp/http-echo:latest" + }, + "FailsTask": false, + "RestartReason": "", + "SetupError": "", + "DriverError": "", + "ExitCode": 0, + "Signal": 0, + "KillTimeout": 0, + "KillError": "", + "KillReason": "", + "StartDelay": 0, + "DownloadError": "", + "ValidationError": "", + "DiskLimit": 0, + "FailedSibling": "", + "VaultError": "", + "TaskSignalReason": "", + "TaskSignal": "", + "DriverMessage": "Downloading image", + "GenericSource": "" + }, + { + "Type": "Started", + "Time": 1670925635350228700, + "Message": "", + "DisplayMessage": "Task started by client", + "Details": {}, + "FailsTask": false, + "RestartReason": "", + "SetupError": "", + "DriverError": "", + "ExitCode": 0, + "Signal": 0, + "KillTimeout": 0, + "KillError": "", + "KillReason": "", + "StartDelay": 0, + "DownloadError": "", + "ValidationError": "", + "DiskLimit": 0, + "FailedSibling": "", + "VaultError": "", + "TaskSignalReason": "", + "TaskSignal": "", + "DriverMessage": "", + "GenericSource": "" + } + ], + "TaskHandle": null + }, + "DriverState": { + "ContainerID": "7d6d10ece65bd6f64095c7b1b665b3529254796d4c17866c7e6a5e16ad4727c8", + "DriverNetwork": { + "AutoAdvertise": false, + "IP": "172.17.0.2", + "PortMap": null + }, + "ReattachConfig": { + "Addr": "/tmp/plugin1827627390", + "Network": "unix", + "Pid": 97706, + "Protocol": "grpc" + } + } + } + } + } + } +} +``` diff --git a/website/data/docs-nav-data.json b/website/data/docs-nav-data.json index 39e471368..f01f5fde6 100644 --- a/website/data/docs-nav-data.json +++ b/website/data/docs-nav-data.json @@ -659,6 +659,10 @@ } ] }, + { + "title": "client-state", + "path": "commands/operator/client-state" + }, { "title": "debug", "path": "commands/operator/debug"