From c7b1966565a0524000f352dcb933f0701e8c50ce Mon Sep 17 00:00:00 2001 From: hc-github-team-nomad-core <82989552+hc-github-team-nomad-core@users.noreply.github.com> Date: Fri, 15 Sep 2023 08:16:38 -0500 Subject: [PATCH] backport of commit 1339599185af9dbfcca6f0aa1001c6753b8c682b (#18517) Co-authored-by: Gerard Nguyen --- .changelog/18463.txt | 3 ++ api/agent.go | 22 ++++++++- api/agent_test.go | 48 +++++++++++++++++++ command/agent/agent_endpoint.go | 11 ++++- command/server_force_leave.go | 26 ++++++++-- nomad/server.go | 5 ++ website/content/api-docs/agent.mdx | 6 ++- .../docs/commands/server/force-leave.mdx | 19 +++++++- 8 files changed, 132 insertions(+), 8 deletions(-) create mode 100644 .changelog/18463.txt diff --git a/.changelog/18463.txt b/.changelog/18463.txt new file mode 100644 index 000000000..3ab4f5923 --- /dev/null +++ b/.changelog/18463.txt @@ -0,0 +1,3 @@ +```release-note:feature +cli: Add `-prune` flag to `nomad operator force-leave` command +``` \ No newline at end of file diff --git a/api/agent.go b/api/agent.go index 521215803..82497f524 100644 --- a/api/agent.go +++ b/api/agent.go @@ -35,6 +35,12 @@ type KeyringRequest struct { Key string } +// ForceLeaveOpts are used to configure the ForceLeave method. +type ForceLeaveOpts struct { + // Prune indicates whether to remove a node from the list of members + Prune bool +} + // Agent returns a new agent which can be used to query // the agent-specific endpoints. func (c *Client) Agent() *Agent { @@ -163,7 +169,21 @@ func (a *Agent) MembersOpts(opts *QueryOptions) (*ServerMembers, error) { // ForceLeave is used to eject an existing node from the cluster. 
func (a *Agent) ForceLeave(node string) error { - _, err := a.client.put("/v1/agent/force-leave?node="+node, nil, nil, nil) + v := url.Values{} + v.Add("node", node) + _, err := a.client.put("/v1/agent/force-leave?"+v.Encode(), nil, nil, nil) + return err +} + +// ForceLeaveWithOptions is used to eject an existing node from the cluster +// with additional options such as prune. +func (a *Agent) ForceLeaveWithOptions(node string, opts ForceLeaveOpts) error { + v := url.Values{} + v.Add("node", node) + if opts.Prune { + v.Add("prune", "1") + } + _, err := a.client.put("/v1/agent/force-leave?"+v.Encode(), nil, nil, nil) return err } diff --git a/api/agent_test.go b/api/agent_test.go index 8432cdd11..edf7ac63b 100644 --- a/api/agent_test.go +++ b/api/agent_test.go @@ -126,6 +126,7 @@ func TestAgent_ForceLeave(t *testing.T) { must.One(t, n) membersBefore, err := a.MembersOpts(&QueryOptions{}) + must.NoError(t, err) must.Eq(t, membersBefore.Members[1].Status, "alive") err = a.ForceLeave(membersBefore.Members[1].Name) @@ -152,6 +153,53 @@ func TestAgent_ForceLeave(t *testing.T) { wait.Timeout(3*time.Second), wait.Gap(100*time.Millisecond), )) + +} + +func TestAgent_ForceLeavePrune(t *testing.T) { + testutil.Parallel(t) + + c, s := makeClient(t, nil, nil) + defer s.Stop() + a := c.Agent() + + nodeName := "foo" + _, s2 := makeClient(t, nil, func(c *testutil.TestServerConfig) { + c.NodeName = nodeName + c.Server.BootstrapExpect = 0 + }) + + n, err := a.Join(s2.SerfAddr) + must.NoError(t, err) + must.One(t, n) + membersBefore, err := a.MembersOpts(&QueryOptions{}) + must.NoError(t, err) + + s2.Stop() + + forceLeaveOpts := ForceLeaveOpts{ + Prune: true, + } + nodeName = nodeName + ".global" + err = a.ForceLeaveWithOptions(nodeName, forceLeaveOpts) + must.NoError(t, err) + + f := func() error { + membersAfter, err := a.MembersOpts(&QueryOptions{}) + if err != nil { + return err + } + if len(membersAfter.Members) == len(membersBefore.Members) { + return fmt.Errorf("node did not 
get pruned") + } + return nil + } + must.Wait(t, wait.InitialSuccess( + wait.ErrorFunc(f), + wait.Timeout(5*time.Second), + wait.Gap(100*time.Millisecond), + )) + } func (a *AgentMember) String() string { diff --git a/command/agent/agent_endpoint.go b/command/agent/agent_endpoint.go index e5361af02..afd7991c9 100644 --- a/command/agent/agent_endpoint.go +++ b/command/agent/agent_endpoint.go @@ -322,8 +322,17 @@ func (s *HTTPServer) AgentForceLeaveRequest(resp http.ResponseWriter, req *http. return nil, CodedError(400, "missing node to force leave") } + prune, err := parseBool(req, "prune") + if err != nil { + return nil, CodedError(400, "invalid prune value") + } + // Attempt remove - err := srv.RemoveFailedNode(node) + if prune != nil && *prune { + err = srv.RemoveFailedNodePrune(node) + } else { + err = srv.RemoveFailedNode(node) + } return nil, err } diff --git a/command/server_force_leave.go b/command/server_force_leave.go index df05026ae..3e82a8c01 100644 --- a/command/server_force_leave.go +++ b/command/server_force_leave.go @@ -7,6 +7,7 @@ import ( "fmt" "strings" + "github.com/hashicorp/nomad/api" "github.com/posener/complete" ) @@ -21,14 +22,22 @@ Usage: nomad server force-leave [options] Forces an server to enter the "left" state. This can be used to eject nodes which have failed and will not rejoin the cluster. Note that if the member is actually still alive, it will - eventually rejoin the cluster again. + eventually rejoin the cluster again. The failed or left server will + be garbage collected after 24h. If ACLs are enabled, this option requires a token with the 'agent:write' capability. General Options: - ` + generalOptionsUsage(usageOptsDefault|usageOptsNoNamespace) + ` + generalOptionsUsage(usageOptsDefault|usageOptsNoNamespace) + ` + +Server Force-Leave Options: + + -prune + Removes failed or left server from the Serf member list immediately. + If member is actually still alive, it will eventually rejoin the cluster again. 
+` return strings.TrimSpace(helpText) } @@ -37,7 +46,10 @@ func (c *ServerForceLeaveCommand) Synopsis() string { } func (c *ServerForceLeaveCommand) AutocompleteFlags() complete.Flags { - return c.Meta.AutocompleteFlags(FlagSetClient) + return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient), + complete.Flags{ + "-prune": complete.PredictNothing, + }) } func (c *ServerForceLeaveCommand) AutocompleteArgs() complete.Predictor { @@ -47,8 +59,11 @@ func (c *ServerForceLeaveCommand) AutocompleteArgs() complete.Predictor { func (c *ServerForceLeaveCommand) Name() string { return "server force-leave" } func (c *ServerForceLeaveCommand) Run(args []string) int { + var prune bool flags := c.Meta.FlagSet(c.Name(), FlagSetClient) flags.Usage = func() { c.Ui.Output(c.Help()) } + flags.BoolVar(&prune, "prune", false, "Remove server completely from list of members") + if err := flags.Parse(args); err != nil { return 1 } @@ -70,7 +85,10 @@ func (c *ServerForceLeaveCommand) Run(args []string) int { } // Call force-leave on the node - if err := client.Agent().ForceLeave(node); err != nil { + forceLeaveOpts := api.ForceLeaveOpts{ + Prune: prune, + } + if err := client.Agent().ForceLeaveWithOptions(node, forceLeaveOpts); err != nil { c.Ui.Error(fmt.Sprintf("Error force-leaving server %s: %s", node, err)) return 1 } diff --git a/nomad/server.go b/nomad/server.go index b2eae020b..c3252f8c6 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -1866,6 +1866,11 @@ func (s *Server) RemoveFailedNode(node string) error { return s.serf.RemoveFailedNode(node) } +// RemoveFailedNodePrune immediately removes a failed node from the list of members +func (s *Server) RemoveFailedNodePrune(node string) error { + return s.serf.RemoveFailedNodePrune(node) +} + // KeyManager returns the Serf keyring manager func (s *Server) KeyManager() *serf.KeyManager { return s.serf.KeyManager() diff --git a/website/content/api-docs/agent.mdx b/website/content/api-docs/agent.mdx index 
5e70a2af6..57dca70fa 100644 --- a/website/content/api-docs/agent.mdx +++ b/website/content/api-docs/agent.mdx @@ -441,13 +441,17 @@ The table below shows this endpoint's support for ### Parameters - `node` `(string: <required>)` - Specifies the name of the node to force leave. +- `prune` `(boolean: <optional>)` - Removes a failed or left server from the Serf + member list immediately. If the member is actually still alive, it will eventually rejoin + the cluster again. + ### Sample Request ```shell-session $ curl \ --request POST \ - https://localhost:4646/v1/agent/force-leave?node=client-ab2e23dc + "https://localhost:4646/v1/agent/force-leave?node=client-ab2e23dc&prune=true" ``` ## Health diff --git a/website/content/docs/commands/server/force-leave.mdx b/website/content/docs/commands/server/force-leave.mdx index d31ab75e7..cb7f001f8 100644 --- a/website/content/docs/commands/server/force-leave.mdx +++ b/website/content/docs/commands/server/force-leave.mdx @@ -10,7 +10,9 @@ description: > The `server force-leave` command forces a server to enter the "left" state. This can be used to eject server nodes which have failed and will not rejoin -the cluster. Note that if the server is actually still alive, it will +the cluster. The failed or left server will be garbage collected after `24h`. + +~> Note that if the server is actually still alive, it will eventually rejoin the cluster again. ## Usage @@ -22,6 +24,9 @@ nomad server force-leave [options] <node> This command expects only one argument - the node which should be forced to enter the "left" state. +Additionally, by specifying the `-prune` flag, a failed or left node can be forcibly removed +from the list of members immediately. + If ACLs are enabled, this option requires a token with the `agent:write` capability. @@ -29,6 +34,11 @@ capability. @include 'general_options_no_namespace.mdx' +## Server Force-Leave Options + +- `-prune`: Removes a failed or left server from the Serf member list immediately. 
+ If the member is actually still alive, it will eventually rejoin the cluster again. + ## Examples Force-leave the server "node1": @@ -37,3 +47,10 @@ Force-leave the server "node1": $ nomad server force-leave node1 ``` + +Force-leave the server "node1" and prune it: + +```shell-session +$ nomad server force-leave -prune node1 + +``` \ No newline at end of file