backport of commit 1339599185af9dbfcca6f0aa1001c6753b8c682b (#18517)

Co-authored-by: Gerard Nguyen <nguyenvanthao1991@gmail.com>
This commit is contained in:
hc-github-team-nomad-core 2023-09-15 08:16:38 -05:00 committed by GitHub
parent 1425eecbbe
commit c7b1966565
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 132 additions and 8 deletions

3
.changelog/18463.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:feature
cli: Add `-prune` flag to `nomad server force-leave` command
```

View File

@ -35,6 +35,12 @@ type KeyringRequest struct {
Key string
}
// ForceLeaveOpts carries the optional settings accepted by
// Agent.ForceLeaveWithOptions. The zero value requests a plain force-leave.
type ForceLeaveOpts struct {
	// Prune, when true, asks for the node to be removed from the list of
	// members rather than only being marked as left.
	Prune bool
}
// Agent returns a new agent which can be used to query
// the agent-specific endpoints.
func (c *Client) Agent() *Agent {
@ -163,7 +169,21 @@ func (a *Agent) MembersOpts(opts *QueryOptions) (*ServerMembers, error) {
// ForceLeave is used to eject an existing node from the cluster.
//
// The node name is passed as the "node" query parameter and is URL-encoded
// via url.Values, so reserved characters in the name are escaped instead of
// being concatenated verbatim into the request path. Exactly one PUT is
// issued; any error from the request is returned unchanged.
func (a *Agent) ForceLeave(node string) error {
	v := url.Values{}
	v.Add("node", node)
	_, err := a.client.put("/v1/agent/force-leave?"+v.Encode(), nil, nil, nil)
	return err
}
// ForceLeaveWithOptions ejects an existing node from the cluster, applying
// the extra behaviour selected in opts.
//
// The request always carries the URL-encoded "node" query parameter; when
// opts.Prune is set, "prune=1" is appended as well. The error from the
// underlying PUT request is returned as-is.
func (a *Agent) ForceLeaveWithOptions(node string, opts ForceLeaveOpts) error {
	query := url.Values{"node": []string{node}}
	if opts.Prune {
		query.Add("prune", "1")
	}

	endpoint := "/v1/agent/force-leave?" + query.Encode()
	_, err := a.client.put(endpoint, nil, nil, nil)
	return err
}

View File

@ -126,6 +126,7 @@ func TestAgent_ForceLeave(t *testing.T) {
must.One(t, n)
membersBefore, err := a.MembersOpts(&QueryOptions{})
must.NoError(t, err)
must.Eq(t, membersBefore.Members[1].Status, "alive")
err = a.ForceLeave(membersBefore.Members[1].Name)
@ -152,6 +153,53 @@ func TestAgent_ForceLeave(t *testing.T) {
wait.Timeout(3*time.Second),
wait.Gap(100*time.Millisecond),
))
}
// TestAgent_ForceLeavePrune joins a second test server to the first, stops
// it, force-leaves it with Prune enabled, and then waits until the member
// list reported by the first agent no longer has the same length as before.
func TestAgent_ForceLeavePrune(t *testing.T) {
	testutil.Parallel(t)

	client, primary := makeClient(t, nil, nil)
	defer primary.Stop()
	agent := client.Agent()

	const peerName = "foo"
	_, peer := makeClient(t, nil, func(cfg *testutil.TestServerConfig) {
		cfg.NodeName = peerName
		cfg.Server.BootstrapExpect = 0
	})

	joined, err := agent.Join(peer.SerfAddr)
	must.NoError(t, err)
	must.One(t, joined)

	before, err := agent.MembersOpts(&QueryOptions{})
	must.NoError(t, err)

	peer.Stop()

	// The member is addressed by its node name with a ".global" suffix
	// (presumably the region name - confirm against the test server defaults).
	err = agent.ForceLeaveWithOptions(peerName+".global", ForceLeaveOpts{Prune: true})
	must.NoError(t, err)

	// Pruning is observed as a change in the number of reported members.
	pruned := func() error {
		after, err := agent.MembersOpts(&QueryOptions{})
		if err != nil {
			return err
		}
		if len(after.Members) == len(before.Members) {
			return fmt.Errorf("node did not get pruned")
		}
		return nil
	}

	must.Wait(t, wait.InitialSuccess(
		wait.ErrorFunc(pruned),
		wait.Timeout(5*time.Second),
		wait.Gap(100*time.Millisecond),
	))
}
func (a *AgentMember) String() string {

View File

@ -322,8 +322,17 @@ func (s *HTTPServer) AgentForceLeaveRequest(resp http.ResponseWriter, req *http.
return nil, CodedError(400, "missing node to force leave")
}
prune, err := parseBool(req, "prune")
if err != nil {
return nil, CodedError(400, "invalid prune value")
}
// Attempt remove
err := srv.RemoveFailedNode(node)
if prune != nil && *prune {
err = srv.RemoveFailedNodePrune(node)
} else {
err = srv.RemoveFailedNode(node)
}
return nil, err
}

View File

@ -7,6 +7,7 @@ import (
"fmt"
"strings"
"github.com/hashicorp/nomad/api"
"github.com/posener/complete"
)
@ -21,14 +22,22 @@ Usage: nomad server force-leave [options] <node>
Forces an server to enter the "left" state. This can be used to
eject nodes which have failed and will not rejoin the cluster.
Note that if the member is actually still alive, it will
eventually rejoin the cluster again.
eventually rejoin the cluster again. The failed or left server will
be garbage collected after 24h.
If ACLs are enabled, this option requires a token with the 'agent:write'
capability.
General Options:
` + generalOptionsUsage(usageOptsDefault|usageOptsNoNamespace)
` + generalOptionsUsage(usageOptsDefault|usageOptsNoNamespace) + `
Server Force-Leave Options:
-prune
Removes failed or left server from the Serf member list immediately.
If member is actually still alive, it will eventually rejoin the cluster again.
`
return strings.TrimSpace(helpText)
}
@ -37,7 +46,10 @@ func (c *ServerForceLeaveCommand) Synopsis() string {
}
// AutocompleteFlags returns the flag completions for this command: the
// client flag set from Meta merged with the command-specific -prune flag.
// -prune is a boolean switch, so no value is predicted for it.
func (c *ServerForceLeaveCommand) AutocompleteFlags() complete.Flags {
	return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient),
		complete.Flags{
			"-prune": complete.PredictNothing,
		})
}
func (c *ServerForceLeaveCommand) AutocompleteArgs() complete.Predictor {
@ -47,8 +59,11 @@ func (c *ServerForceLeaveCommand) AutocompleteArgs() complete.Predictor {
func (c *ServerForceLeaveCommand) Name() string { return "server force-leave" }
func (c *ServerForceLeaveCommand) Run(args []string) int {
var prune bool
flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
flags.Usage = func() { c.Ui.Output(c.Help()) }
flags.BoolVar(&prune, "prune", false, "Remove server completely from list of members")
if err := flags.Parse(args); err != nil {
return 1
}
@ -70,7 +85,10 @@ func (c *ServerForceLeaveCommand) Run(args []string) int {
}
// Call force-leave on the node
if err := client.Agent().ForceLeave(node); err != nil {
forceLeaveOpts := api.ForceLeaveOpts{
Prune: prune,
}
if err := client.Agent().ForceLeaveWithOptions(node, forceLeaveOpts); err != nil {
c.Ui.Error(fmt.Sprintf("Error force-leaving server %s: %s", node, err))
return 1
}

View File

@ -1866,6 +1866,11 @@ func (s *Server) RemoveFailedNode(node string) error {
return s.serf.RemoveFailedNode(node)
}
// RemoveFailedNodePrune asks the server's Serf instance to remove the named
// failed node and prune it from the list of members immediately. The error
// returned by Serf is passed through unchanged.
func (s *Server) RemoveFailedNodePrune(node string) error {
	err := s.serf.RemoveFailedNodePrune(node)
	return err
}
// KeyManager returns the Serf keyring manager
func (s *Server) KeyManager() *serf.KeyManager {
return s.serf.KeyManager()

View File

@ -441,13 +441,17 @@ The table below shows this endpoint's support for
### Parameters
- `node` `(string: <required>)` - Specifies the name of the node to force leave.
- `prune` `(boolean: <optional>)` - Removes a failed or left server from the Serf
member list immediately. If the member is actually still alive, it will eventually
rejoin the cluster.
### Sample Request
```shell-session
$ curl \
--request POST \
https://localhost:4646/v1/agent/force-leave?node=client-ab2e23dc
"https://localhost:4646/v1/agent/force-leave?node=client-ab2e23dc&prune=true"
```
## Health

View File

@ -10,7 +10,9 @@ description: >
The `server force-leave` command forces a server to enter the "left" state.
This can be used to eject server nodes which have failed and will not rejoin
the cluster. Note that if the server is actually still alive, it will
the cluster. The failed or left server will be garbage collected after `24h`.
~> Note that if the server is actually still alive, it will
eventually rejoin the cluster again.
## Usage
@ -22,6 +24,9 @@ nomad server force-leave [options] <node>
This command expects only one argument - the node which should be forced
to enter the "left" state.
Additionally, by specifying the `-prune` flag, a failed or left node can be
removed from the list of members immediately.
If ACLs are enabled, this option requires a token with the `agent:write`
capability.
@ -29,6 +34,11 @@ capability.
@include 'general_options_no_namespace.mdx'
## Server Force-Leave Options
- `-prune`: Removes a failed or left server from the Serf member list immediately.
If the member is actually still alive, it will eventually rejoin the cluster.
## Examples
Force-leave the server "node1":
@ -37,3 +47,10 @@ Force-leave the server "node1":
$ nomad server force-leave node1
```
Force-leave the server "node1" and prune it:
```shell-session
$ nomad server force-leave -prune node1
```