replication: fix potential panic during upgrades (#17476)

If the authoritative region has been upgraded to a version of Nomad that has new
replicated objects (such as ACL Auth Methods, ACL Binding Rules, etc.), the
non-authoritative regions will start replicating those objects as soon as their
leader is upgraded. If a server in the non-authoritative region is upgraded and
then becomes the leader before all the other servers in the region have been
upgraded, then it will attempt to write a Raft log entry that the followers
don't understand. The followers will then panic.

Add the same minimum version checks that we do for RPC writes to the leader's
replication loop.
This commit is contained in:
Tim Gross 2023-06-12 08:53:56 -04:00 committed by GitHub
parent 8bd3bdab42
commit e3a37c0b97
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 38 additions and 2 deletions

3
.changelog/17476.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:bug
replication: Fix a potential panic when a non-authoritative region is upgraded and a server with the new version becomes the leader.
```

View File

@ -58,7 +58,7 @@ var minACLRoleVersion = version.Must(version.NewVersion("1.4.0"))
// minACLAuthMethodVersion is the Nomad version at which the ACL auth methods
// table was introduced. It forms the minimum version all federated servers must
// meet before the feature can be used.
var minACLAuthMethodVersion = version.Must(version.NewVersion("1.5.0-beta.1"))
var minACLAuthMethodVersion = version.Must(version.NewVersion("1.5.0"))
// minACLJWTAuthMethodVersion is the Nomad version at which the ACL JWT auth method type
// was introduced. It forms the minimum version all federated servers must
@ -68,7 +68,7 @@ var minACLJWTAuthMethodVersion = version.Must(version.NewVersion("1.5.4"))
// minACLBindingRuleVersion is the Nomad version at which the ACL binding rules
// table was introduced. It forms the minimum version all federated servers
// must meet before the feature can be used.
var minACLBindingRuleVersion = version.Must(version.NewVersion("1.5.0-beta.1"))
var minACLBindingRuleVersion = version.Must(version.NewVersion("1.5.0"))
// minNomadServiceRegistrationVersion is the Nomad version at which the service
// registrations table was introduced. It forms the minimum version all local
@ -1848,6 +1848,17 @@ func (s *Server) replicateACLRoles(stopCh chan struct{}) {
// parameters are controlled internally.
_ = limiter.Wait(context.Background())
if !ServersMeetMinimumVersion(
s.serf.Members(), s.Region(), minACLRoleVersion, true) {
s.logger.Trace(
"all servers must be upgraded to 1.4.0 or later before ACL Roles can be replicated")
if s.replicationBackoffContinue(stopCh) {
continue
} else {
return
}
}
// Set the replication token on each replication iteration so that
// it is always current and can handle agent SIGHUP reloads.
req.AuthToken = s.ReplicationToken()
@ -2046,6 +2057,17 @@ func (s *Server) replicateACLAuthMethods(stopCh chan struct{}) {
// parameters are controlled internally.
_ = limiter.Wait(context.Background())
if !ServersMeetMinimumVersion(
s.serf.Members(), s.Region(), minACLAuthMethodVersion, true) {
s.logger.Trace(
"all servers must be upgraded to 1.5.0 or later before ACL Auth Methods can be replicated")
if s.replicationBackoffContinue(stopCh) {
continue
} else {
return
}
}
// Set the replication token on each replication iteration so that
// it is always current and can handle agent SIGHUP reloads.
req.AuthToken = s.ReplicationToken()
@ -2241,6 +2263,17 @@ func (s *Server) replicateACLBindingRules(stopCh chan struct{}) {
// parameters are controlled internally.
_ = limiter.Wait(context.Background())
if !ServersMeetMinimumVersion(
s.serf.Members(), s.Region(), minACLBindingRuleVersion, true) {
s.logger.Trace(
"all servers must be upgraded to 1.5.0 or later before ACL Binding Rules can be replicated")
if s.replicationBackoffContinue(stopCh) {
continue
} else {
return
}
}
// Set the replication token on each replication iteration so that
// it is always current and can handle agent SIGHUP reloads.
req.AuthToken = s.ReplicationToken()