Fix rejoin_after_leave behavior (#15552)

2023-01-11 15:39:24 -06:00 · 2023-01-11 15:39:24 -06:00 · 1c32471805
parent 7d1059b5ae
commit 1c32471805
5 changed files with 18 additions and 2 deletions
--- a/.changelog/15552.txt
+++ b/.changelog/15552.txt
@ -0,0 +1,3 @@
+```release-note:bug
+server: Fixed a bug where rejoin_after_leave config was not being respected
+```
--- a/command/agent/agent.go
+++ b/command/agent/agent.go
@ -309,6 +309,7 @@ func convertServerConfig(agentConfig *Config) (*nomad.Config, error) {
 	conf.RPCAddr.IP = rpcAddr.IP
 	conf.SerfConfig.MemberlistConfig.BindPort = serfAddr.Port
 	conf.SerfConfig.MemberlistConfig.BindAddr = serfAddr.IP.String()
+	conf.SerfConfig.RejoinAfterLeave = agentConfig.Server.RejoinAfterLeave

 	// Set up the advertise addresses
 	rpcAddr, err = net.ResolveTCPAddr("tcp", agentConfig.AdvertiseAddrs.RPC)
--- a/command/agent/config.go
+++ b/command/agent/config.go
@ -547,7 +547,7 @@ type ServerConfig struct {
 	RetryIntervalHCL string `hcl:"retry_interval" json:"-"`

 	// RejoinAfterLeave controls our interaction with the cluster after leave.
-	// When set to false (default), a leave causes Consul to not rejoin
+	// When set to false (default), a leave causes Nomad to not rejoin
 	// the cluster until an explicit join is received. If this is set to
 	// true, we ignore the leave, and rejoin the cluster on start.
 	RejoinAfterLeave bool `hcl:"rejoin_after_leave"`
--- a/nomad/server.go
+++ b/nomad/server.go
@ -1531,7 +1531,6 @@ func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string) (
 			return nil, err
 		}
 	}
-	conf.RejoinAfterLeave = true
 	// LeavePropagateDelay is used to make sure broadcasted leave intents propagate
 	// This value was tuned using https://www.serf.io/docs/internals/simulator.html to
 	// allow for convergence in 99.9% of nodes in a 10 node cluster
--- a/website/content/docs/upgrade/upgrade-specific.mdx
+++ b/website/content/docs/upgrade/upgrade-specific.mdx
@ -62,6 +62,18 @@ from the Nomad client by setting [`set_environment_variables`][artifact_env].
 The use of filesystem isolation can be disabled in Client configuration by
 setting [`disable_filesystem_isolation`][artifact_fs_isolation].

+#### Server `rejoin_after_leave` (default: `false`) now enforced
+
+All Nomad versions prior to v1.5.0 have incorrectly ignored the Server [`rejoin_after_leave`]
+configuration option. This bug has been fixed in Nomad version v1.5.0.
+
+Prior to v1.5.0, Nomad servers always behaved as if `rejoin_after_leave` were `true`, regardless of
+the server configuration, even though the documentation incorrectly indicated a default of `false`.
+
+Cluster operators should be aware that explicit `leave` events (such as `nomad server force-leave`)
+will now be handled according to the configured `rejoin_after_leave` value, and should review
+whether they were inadvertently relying on the previous buggy behavior.
+
 ## Nomad 1.4.0

 #### Possible Panic During Upgrades
@ -1545,6 +1557,7 @@ deleted and then Nomad 0.3.0 can be launched.
 [`sidecar_task.config`]: /docs/job-specification/sidecar_task#config
 [`raft_protocol`]: /docs/configuration/server#raft_protocol
 [`raft protocol`]: /docs/configuration/server#raft_protocol
+[`rejoin_after_leave`]: /docs/configuration/server#rejoin_after_leave
 [reserved]: /docs/configuration/client#reserved-parameters
 [task-config]: /docs/job-specification/task#config
 [tls-guide]: https://learn.hashicorp.com/tutorials/nomad/security-enable-tls