Add logic to allow changing a failed node's ID
This commit is contained in:
parent
5ab580990b
commit
bf09061e86
|
@ -1335,8 +1335,13 @@ AFTER_CHECK:
|
|||
|
||||
// If there's existing information about the node, do not
|
||||
// clobber it.
|
||||
SkipNodeUpdate: true,
|
||||
//SkipNodeUpdate: true,
|
||||
}
|
||||
if node != nil {
|
||||
req.TaggedAddresses = node.TaggedAddresses
|
||||
req.NodeMeta = node.Meta
|
||||
}
|
||||
|
||||
_, err = s.raftApply(structs.RegisterRequestType, &req)
|
||||
return err
|
||||
}
|
||||
|
|
|
@ -953,6 +953,78 @@ func TestLeader_ChangeServerID(t *testing.T) {
|
|||
})
|
||||
}
|
||||
|
||||
// TestLeader_ChangeNodeID brings up a three-server cluster, shuts one server
// down, and then joins a replacement server that reuses the dead server's node
// name. It passes once Raft and the LAN member list converge back to three
// healthy servers with the replacement in place of the failed one.
func TestLeader_ChangeNodeID(t *testing.T) {
|
||||
t.Parallel()
|
||||
// Shared configuration for the three initial servers: none bootstraps on
// its own, and BootstrapExpect is set to the cluster size of 3.
conf := func(c *Config) {
|
||||
c.Bootstrap = false
|
||||
c.BootstrapExpect = 3
|
||||
c.Datacenter = "dc1"
|
||||
}
|
||||
dir1, s1 := testServerWithConfig(t, conf)
|
||||
defer os.RemoveAll(dir1)
|
||||
defer s1.Shutdown()
|
||||
|
||||
dir2, s2 := testServerWithConfig(t, conf)
|
||||
defer os.RemoveAll(dir2)
|
||||
defer s2.Shutdown()
|
||||
|
||||
dir3, s3 := testServerWithConfig(t, conf)
|
||||
defer os.RemoveAll(dir3)
|
||||
defer s3.Shutdown()
|
||||
|
||||
servers := []*Server{s1, s2, s3}
|
||||
|
||||
// Join s2 and s3 to s1, then wait until every server answers RPCs for
// "dc1" and reports 3 peers.
|
||||
joinLAN(t, s2, s1)
|
||||
|
||||
joinLAN(t, s3, s1)
|
||||
for _, s := range servers {
|
||||
testrpc.WaitForTestAgent(t, s.RPC, "dc1")
|
||||
retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
|
||||
}
|
||||
|
||||
// Shut down s3 explicitly (the deferred Shutdown above will run again at
// test exit) so that its node name can be reused by a replacement server.
|
||||
s3.Shutdown()
|
||||
|
||||
// Wait until s1's LAN member list shows exactly two members still alive,
// i.e. the shutdown of s3 has been noticed.
retry.Run(t, func(r *retry.R) {
|
||||
alive := 0
|
||||
for _, m := range s1.LANMembers() {
|
||||
if m.Status == serf.StatusAlive {
|
||||
alive++
|
||||
}
|
||||
}
|
||||
if got, want := alive, 2; got != want {
|
||||
r.Fatalf("got %d alive members want %d", got, want)
|
||||
}
|
||||
})
|
||||
|
||||
// Bring up a new server that reuses s3's node name. It is presumably
// assigned a different node ID by testServerWithConfig (TODO confirm),
// which is the name clash with a failed node that this test targets.
|
||||
dir4, s4 := testServerWithConfig(t, func(c *Config) {
|
||||
c.Bootstrap = false
|
||||
c.Datacenter = "dc1"
|
||||
c.NodeName = s3.config.NodeName
|
||||
})
|
||||
defer os.RemoveAll(dir4)
|
||||
defer s4.Shutdown()
|
||||
joinLAN(t, s4, s1)
|
||||
// s4 takes s3's slot in the list of servers checked below.
servers[2] = s4
|
||||
|
||||
// Make sure the dead server is removed and we're back to 3 total peers
// on every current server (s1, s2 and s4).
|
||||
retry.Run(t, func(r *retry.R) {
|
||||
r.Check(wantRaft(servers))
|
||||
for _, s := range servers {
|
||||
r.Check(wantPeers(s, 3))
|
||||
}
|
||||
})
|
||||
|
||||
// Finally, every member s1 knows about on the LAN must be alive; any
// member in another state is reported as a failure and retried.
retry.Run(t, func(r *retry.R) {
|
||||
for _, m := range s1.LANMembers() {
|
||||
if m.Status != serf.StatusAlive {
|
||||
r.Fatalf("bad status: %v", m)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestLeader_ACL_Initialization(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
|
|
@ -369,7 +369,22 @@ func (s *Store) ensureNoNodeWithSimilarNameTxn(tx *memdb.Txn, node *structs.Node
|
|||
for nodeIt := enodes.Next(); nodeIt != nil; nodeIt = enodes.Next() {
|
||||
enode := nodeIt.(*structs.Node)
|
||||
if strings.EqualFold(node.Node, enode.Node) && node.ID != enode.ID {
|
||||
if !(enode.ID == "" && allowClashWithoutID) {
|
||||
// Look up the existing node's Serf health check to see if it's failed.
|
||||
// If it is, the node can be renamed.
|
||||
enodeCheck, err := tx.First("checks", "id", enode.Node, string(structs.SerfCheckID))
|
||||
if err != nil {
|
||||
return fmt.Errorf("Cannot get status of node %s: %s", enode.Node, err)
|
||||
}
|
||||
if enodeCheck == nil {
|
||||
return fmt.Errorf("Cannot rename node %s: Serf health check not found for existing node", enode.Node)
|
||||
}
|
||||
|
||||
enodeSerfCheck, ok := enodeCheck.(*structs.HealthCheck)
|
||||
if !ok {
|
||||
return fmt.Errorf("Existing node %q's Serf health check has type %T", enode.Node, enodeSerfCheck)
|
||||
}
|
||||
|
||||
if !((enode.ID == "" || enodeSerfCheck.Status == api.HealthCritical) && allowClashWithoutID) {
|
||||
return fmt.Errorf("Node name %s is reserved by node %s with name %s", node.Node, enode.ID, enode.Node)
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue