agent: add variation of force-leave that exclusively works on the WAN (#11722)
Fixes #6548
This commit is contained in:
parent
d2f53d20ac
commit
6ec84cfbe2
|
@ -0,0 +1,3 @@
|
||||||
|
```release-note:improvement
|
||||||
|
agent: add variation of force-leave that exclusively works on the WAN
|
||||||
|
```
|
|
@ -1553,9 +1553,10 @@ func (a *Agent) RefreshPrimaryGatewayFallbackAddresses(addrs []string) error {
|
||||||
}
|
}
|
||||||
|
|
||||||
// ForceLeave is used to remove a failed node from the cluster
|
// ForceLeave is used to remove a failed node from the cluster
|
||||||
func (a *Agent) ForceLeave(node string, prune bool, entMeta *structs.EnterpriseMeta) (err error) {
|
func (a *Agent) ForceLeave(node string, prune bool, entMeta *structs.EnterpriseMeta) error {
|
||||||
a.logger.Info("Force leaving node", "node", node)
|
a.logger.Info("Force leaving node", "node", node)
|
||||||
err = a.delegate.RemoveFailedNode(node, prune, entMeta)
|
|
||||||
|
err := a.delegate.RemoveFailedNode(node, prune, entMeta)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
a.logger.Warn("Failed to remove node",
|
a.logger.Warn("Failed to remove node",
|
||||||
"node", node,
|
"node", node,
|
||||||
|
@ -1565,6 +1566,25 @@ func (a *Agent) ForceLeave(node string, prune bool, entMeta *structs.EnterpriseM
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ForceLeaveWAN is used to remove a failed node from the WAN cluster
|
||||||
|
func (a *Agent) ForceLeaveWAN(node string, prune bool, entMeta *structs.EnterpriseMeta) error {
|
||||||
|
a.logger.Info("(WAN) Force leaving node", "node", node)
|
||||||
|
|
||||||
|
srv, ok := a.delegate.(*consul.Server)
|
||||||
|
if !ok {
|
||||||
|
return fmt.Errorf("Must be a server to force-leave a node from the WAN cluster")
|
||||||
|
}
|
||||||
|
|
||||||
|
err := srv.RemoveFailedNodeWAN(node, prune, entMeta)
|
||||||
|
if err != nil {
|
||||||
|
a.logger.Warn("(WAN) Failed to remove node",
|
||||||
|
"node", node,
|
||||||
|
"error", err,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
// AgentLocalMember is used to retrieve the LAN member for the local node.
|
// AgentLocalMember is used to retrieve the LAN member for the local node.
|
||||||
func (a *Agent) AgentLocalMember() serf.Member {
|
func (a *Agent) AgentLocalMember() serf.Member {
|
||||||
return a.delegate.AgentLocalMember()
|
return a.delegate.AgentLocalMember()
|
||||||
|
|
|
@ -640,9 +640,16 @@ func (s *HTTPHandlers) AgentForceLeave(resp http.ResponseWriter, req *http.Reque
|
||||||
// Check the value of the prune query
|
// Check the value of the prune query
|
||||||
_, prune := req.URL.Query()["prune"]
|
_, prune := req.URL.Query()["prune"]
|
||||||
|
|
||||||
|
// Check if the WAN is being queried
|
||||||
|
_, wan := req.URL.Query()["wan"]
|
||||||
|
|
||||||
addr := strings.TrimPrefix(req.URL.Path, "/v1/agent/force-leave/")
|
addr := strings.TrimPrefix(req.URL.Path, "/v1/agent/force-leave/")
|
||||||
|
if wan {
|
||||||
|
return nil, s.agent.ForceLeaveWAN(addr, prune, entMeta)
|
||||||
|
} else {
|
||||||
return nil, s.agent.ForceLeave(addr, prune, entMeta)
|
return nil, s.agent.ForceLeave(addr, prune, entMeta)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// syncChanges is a helper function which wraps a blocking call to sync
|
// syncChanges is a helper function which wraps a blocking call to sync
|
||||||
// services and checks to the server. If the operation fails, we only
|
// services and checks to the server. If the operation fails, we only
|
||||||
|
|
|
@ -2265,6 +2265,74 @@ func TestAgent_ForceLeavePrune(t *testing.T) {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestAgent_ForceLeavePrune_WAN(t *testing.T) {
|
||||||
|
if testing.Short() {
|
||||||
|
t.Skip("too slow for testing.Short")
|
||||||
|
}
|
||||||
|
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
a1 := StartTestAgent(t, TestAgent{Name: "dc1", HCL: `
|
||||||
|
datacenter = "dc1"
|
||||||
|
primary_datacenter = "dc1"
|
||||||
|
gossip_wan {
|
||||||
|
probe_interval = "50ms"
|
||||||
|
suspicion_mult = 2
|
||||||
|
}
|
||||||
|
`})
|
||||||
|
defer a1.Shutdown()
|
||||||
|
|
||||||
|
a2 := StartTestAgent(t, TestAgent{Name: "dc2", HCL: `
|
||||||
|
datacenter = "dc2"
|
||||||
|
primary_datacenter = "dc1"
|
||||||
|
`})
|
||||||
|
defer a2.Shutdown()
|
||||||
|
|
||||||
|
testrpc.WaitForLeader(t, a1.RPC, "dc1")
|
||||||
|
testrpc.WaitForLeader(t, a2.RPC, "dc2")
|
||||||
|
|
||||||
|
// Wait for the WAN join.
|
||||||
|
addr := fmt.Sprintf("127.0.0.1:%d", a1.Config.SerfPortWAN)
|
||||||
|
_, err := a2.JoinWAN([]string{addr})
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
testrpc.WaitForLeader(t, a1.RPC, "dc2")
|
||||||
|
testrpc.WaitForLeader(t, a2.RPC, "dc1")
|
||||||
|
|
||||||
|
retry.Run(t, func(r *retry.R) {
|
||||||
|
require.Len(r, a1.WANMembers(), 2)
|
||||||
|
require.Len(r, a2.WANMembers(), 2)
|
||||||
|
})
|
||||||
|
|
||||||
|
wanNodeName_a2 := a2.Config.NodeName + ".dc2"
|
||||||
|
|
||||||
|
// Shutdown and wait for agent being marked as failed, so we wait for full
|
||||||
|
// shutdown of Agent.
|
||||||
|
require.NoError(t, a2.Shutdown())
|
||||||
|
retry.Run(t, func(r *retry.R) {
|
||||||
|
m := a1.WANMembers()
|
||||||
|
for _, member := range m {
|
||||||
|
if member.Name == wanNodeName_a2 {
|
||||||
|
if member.Status != serf.StatusFailed {
|
||||||
|
r.Fatalf("got status %q want %q", member.Status, serf.StatusFailed)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
// Force leave now
|
||||||
|
req, err := http.NewRequest("PUT", fmt.Sprintf("/v1/agent/force-leave/%s?prune=1&wan=1", wanNodeName_a2), nil)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
resp := httptest.NewRecorder()
|
||||||
|
a1.srv.h.ServeHTTP(resp, req)
|
||||||
|
require.Equal(t, http.StatusOK, resp.Code, resp.Body.String())
|
||||||
|
|
||||||
|
retry.Run(t, func(r *retry.R) {
|
||||||
|
require.Len(r, a1.WANMembers(), 1)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
func TestAgent_RegisterCheck(t *testing.T) {
|
func TestAgent_RegisterCheck(t *testing.T) {
|
||||||
if testing.Short() {
|
if testing.Short() {
|
||||||
t.Skip("too slow for testing.Short")
|
t.Skip("too slow for testing.Short")
|
||||||
|
|
|
@ -1193,6 +1193,18 @@ func (s *Server) RemoveFailedNode(node string, prune bool, entMeta *structs.Ente
|
||||||
return s.removeFailedNode(removeFn, node, wanNode, entMeta)
|
return s.removeFailedNode(removeFn, node, wanNode, entMeta)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RemoveFailedNodeWAN is used to remove a failed node from the WAN cluster.
|
||||||
|
func (s *Server) RemoveFailedNodeWAN(wanNode string, prune bool, entMeta *structs.EnterpriseMeta) error {
|
||||||
|
var removeFn func(*serf.Serf, string) error
|
||||||
|
if prune {
|
||||||
|
removeFn = (*serf.Serf).RemoveFailedNodePrune
|
||||||
|
} else {
|
||||||
|
removeFn = (*serf.Serf).RemoveFailedNode
|
||||||
|
}
|
||||||
|
|
||||||
|
return s.removeFailedNode(removeFn, "", wanNode, entMeta)
|
||||||
|
}
|
||||||
|
|
||||||
// IsLeader checks if this server is the cluster leader
|
// IsLeader checks if this server is the cluster leader
|
||||||
func (s *Server) IsLeader() bool {
|
func (s *Server) IsLeader() bool {
|
||||||
return s.raft.State() == raft.Leader
|
return s.raft.State() == raft.Leader
|
||||||
|
|
|
@ -26,6 +26,8 @@ func (s *Server) JoinLAN(addrs []string, entMeta *structs.EnterpriseMeta) (int,
|
||||||
}
|
}
|
||||||
|
|
||||||
// removeFailedNode is used to remove a failed node from the cluster
|
// removeFailedNode is used to remove a failed node from the cluster
|
||||||
|
//
|
||||||
|
// if node is empty, just remove wanNode from the WAN
|
||||||
func (s *Server) removeFailedNode(
|
func (s *Server) removeFailedNode(
|
||||||
removeFn func(*serf.Serf, string) error,
|
removeFn func(*serf.Serf, string) error,
|
||||||
node, wanNode string,
|
node, wanNode string,
|
||||||
|
@ -42,11 +44,13 @@ func (s *Server) removeFailedNode(
|
||||||
|
|
||||||
var merr error
|
var merr error
|
||||||
|
|
||||||
|
if node != "" {
|
||||||
if found, err := maybeRemove(s.serfLAN, node); err != nil {
|
if found, err := maybeRemove(s.serfLAN, node); err != nil {
|
||||||
merr = multierror.Append(merr, fmt.Errorf("could not remove failed node from LAN: %w", err))
|
merr = multierror.Append(merr, fmt.Errorf("could not remove failed node from LAN: %w", err))
|
||||||
} else if found {
|
} else if found {
|
||||||
foundAny = true
|
foundAny = true
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if s.serfWAN != nil {
|
if s.serfWAN != nil {
|
||||||
if found, err := maybeRemove(s.serfWAN, wanNode); err != nil {
|
if found, err := maybeRemove(s.serfWAN, wanNode); err != nil {
|
||||||
|
|
31
api/agent.go
31
api/agent.go
|
@ -1021,25 +1021,36 @@ func (a *Agent) Leave() error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type ForceLeaveOpts struct {
|
||||||
|
// Prune indicates if we should remove a failed agent from the list of
|
||||||
|
// members in addition to ejecting it.
|
||||||
|
Prune bool
|
||||||
|
|
||||||
|
// WAN indicates that the request should exclusively target the WAN pool.
|
||||||
|
WAN bool
|
||||||
|
}
|
||||||
|
|
||||||
// ForceLeave is used to have the agent eject a failed node
|
// ForceLeave is used to have the agent eject a failed node
|
||||||
func (a *Agent) ForceLeave(node string) error {
|
func (a *Agent) ForceLeave(node string) error {
|
||||||
r := a.c.newRequest("PUT", "/v1/agent/force-leave/"+node)
|
return a.ForceLeaveOpts(node, ForceLeaveOpts{})
|
||||||
_, resp, err := a.c.doRequest(r)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
defer closeResponseBody(resp)
|
|
||||||
if err := requireOK(resp); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ForceLeavePrune is used to have an a failed agent removed
|
// ForceLeavePrune is used to have an a failed agent removed
|
||||||
// from the list of members
|
// from the list of members
|
||||||
func (a *Agent) ForceLeavePrune(node string) error {
|
func (a *Agent) ForceLeavePrune(node string) error {
|
||||||
|
return a.ForceLeaveOpts(node, ForceLeaveOpts{Prune: true})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ForceLeaveOpts is used to have the agent eject a failed node or remove it
|
||||||
|
// completely from the list of members.
|
||||||
|
func (a *Agent) ForceLeaveOpts(node string, opts ForceLeaveOpts) error {
|
||||||
r := a.c.newRequest("PUT", "/v1/agent/force-leave/"+node)
|
r := a.c.newRequest("PUT", "/v1/agent/force-leave/"+node)
|
||||||
|
if opts.Prune {
|
||||||
r.params.Set("prune", "1")
|
r.params.Set("prune", "1")
|
||||||
|
}
|
||||||
|
if opts.WAN {
|
||||||
|
r.params.Set("wan", "1")
|
||||||
|
}
|
||||||
_, resp, err := a.c.doRequest(r)
|
_, resp, err := a.c.doRequest(r)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
|
|
|
@ -6,6 +6,7 @@ import (
|
||||||
|
|
||||||
"github.com/mitchellh/cli"
|
"github.com/mitchellh/cli"
|
||||||
|
|
||||||
|
"github.com/hashicorp/consul/api"
|
||||||
"github.com/hashicorp/consul/command/flags"
|
"github.com/hashicorp/consul/command/flags"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -23,12 +24,15 @@ type cmd struct {
|
||||||
|
|
||||||
// flags
|
// flags
|
||||||
prune bool
|
prune bool
|
||||||
|
wan bool
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *cmd) init() {
|
func (c *cmd) init() {
|
||||||
c.flags = flag.NewFlagSet("", flag.ContinueOnError)
|
c.flags = flag.NewFlagSet("", flag.ContinueOnError)
|
||||||
c.flags.BoolVar(&c.prune, "prune", false,
|
c.flags.BoolVar(&c.prune, "prune", false,
|
||||||
"Remove agent completely from list of members")
|
"Remove agent completely from list of members")
|
||||||
|
c.flags.BoolVar(&c.wan, "wan", false,
|
||||||
|
"Exclusively leave the agent from the WAN serf pool.")
|
||||||
c.http = &flags.HTTPFlags{}
|
c.http = &flags.HTTPFlags{}
|
||||||
flags.Merge(c.flags, c.http.ClientFlags())
|
flags.Merge(c.flags, c.http.ClientFlags())
|
||||||
flags.Merge(c.flags, c.http.PartitionFlag())
|
flags.Merge(c.flags, c.http.PartitionFlag())
|
||||||
|
@ -54,12 +58,10 @@ func (c *cmd) Run(args []string) int {
|
||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
|
||||||
if c.prune {
|
err = client.Agent().ForceLeaveOpts(nodes[0], api.ForceLeaveOpts{
|
||||||
err = client.Agent().ForceLeavePrune(nodes[0])
|
Prune: c.prune,
|
||||||
} else {
|
WAN: c.wan,
|
||||||
err = client.Agent().ForceLeave(nodes[0])
|
})
|
||||||
}
|
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
c.UI.Error(fmt.Sprintf("Error force leaving: %s", err))
|
c.UI.Error(fmt.Sprintf("Error force leaving: %s", err))
|
||||||
return 1
|
return 1
|
||||||
|
@ -88,4 +90,5 @@ Usage: consul force-leave [options] name
|
||||||
time before eventually reaping them.
|
time before eventually reaping them.
|
||||||
|
|
||||||
-prune Remove agent completely from list of members
|
-prune Remove agent completely from list of members
|
||||||
|
-wan Exclusively leave the agent from the WAN serf pool.
|
||||||
`
|
`
|
||||||
|
|
Loading…
Reference in New Issue