fix panic from keyring raft entries being written during upgrade (#14821)

During an upgrade to Nomad 1.4.0, if a server running 1.4.0 becomes the leader
before one of the 1.3.x servers, the old server will crash because the keyring
is initialized and writes a raft entry.

Wait until all members are on a version that supports the keyring before
initializing it.
This commit is contained in:
Tim Gross 2022-10-06 12:47:02 -04:00 committed by GitHub
parent 36c644aaf2
commit 80ec5e1346
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 41 additions and 15 deletions

3
.changelog/14821.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:bug
keyring: Fixed a panic that can occur during upgrades to 1.4.0 when initializing the keyring
```

View File

@ -277,6 +277,9 @@ func (e *Encrypter) activeKeySetLocked() (*keyset, error) {
if err != nil {
return nil, err
}
if keyMeta == nil {
return nil, fmt.Errorf("keyring has not been initialized yet")
}
return e.keysetByIDLocked(keyMeta.KeyID)
}

View File

@ -294,10 +294,7 @@ func (s *Server) establishLeadership(stopCh chan struct{}) error {
schedulerConfig := s.getOrCreateSchedulerConfig()
// Create the first root key if it doesn't already exist
err := s.initializeKeyring()
if err != nil {
return err
}
go s.initializeKeyring(stopCh)
// Initialize the ClusterID
_, _ = s.ClusterID()
@ -1966,43 +1963,66 @@ func (s *Server) getOrCreateSchedulerConfig() *structs.SchedulerConfiguration {
return config
}
var minVersionKeyring = version.Must(version.NewVersion("1.4.0"))
// initializeKeyring creates the first root key if the leader doesn't
// already have one. The metadata will be replicated via raft and then
// the followers will get the key material from their own key
// replication.
func (s *Server) initializeKeyring() error {
func (s *Server) initializeKeyring(stopCh <-chan struct{}) {
logger := s.logger.Named("keyring")
store := s.fsm.State()
keyMeta, err := store.GetActiveRootKeyMeta(nil)
if err != nil {
return err
logger.Error("failed to get active key: %v", err)
return
}
if keyMeta != nil {
return nil
return
}
s.logger.Named("core").Trace("initializing keyring")
logger.Trace("verifying cluster is ready to initialize keyring")
for {
select {
case <-stopCh:
return
default:
}
if ServersMeetMinimumVersion(s.serf.Members(), minVersionKeyring, true) {
break
}
}
// we might have lost leadershuip during the version check
if !s.IsLeader() {
return
}
logger.Trace("initializing keyring")
rootKey, err := structs.NewRootKey(structs.EncryptionAlgorithmAES256GCM)
rootKey.Meta.SetActive()
if err != nil {
return fmt.Errorf("could not initialize keyring: %v", err)
logger.Error("could not initialize keyring: %v", err)
return
}
err = s.encrypter.AddKey(rootKey)
if err != nil {
return fmt.Errorf("could not add initial key to keyring: %v", err)
logger.Error("could not add initial key to keyring: %v", err)
return
}
if _, _, err = s.raftApply(structs.RootKeyMetaUpsertRequestType,
structs.KeyringUpdateRootKeyMetaRequest{
RootKeyMeta: rootKey.Meta,
}); err != nil {
return fmt.Errorf("could not initialize keyring: %v", err)
logger.Error("could not initialize keyring: %v", err)
return
}
s.logger.Named("core").Info("initialized keyring", "id", rootKey.Meta.KeyID)
return nil
logger.Info("initialized keyring", "id", rootKey.Meta.KeyID)
}
func (s *Server) generateClusterID() (string, error) {

View File

@ -243,7 +243,7 @@ func TestPlanApply_applyPlanWithNormalizedAllocs(t *testing.T) {
ci.Parallel(t)
s1, cleanupS1 := TestServer(t, func(c *Config) {
c.Build = "0.9.2"
c.Build = "1.4.0"
})
defer cleanupS1()
testutil.WaitForLeader(t, s1.RPC)

View File

@ -488,7 +488,7 @@ func TestWorker_SubmitPlanNormalizedAllocations(t *testing.T) {
s1, cleanupS1 := TestServer(t, func(c *Config) {
c.NumSchedulers = 0
c.EnabledSchedulers = []string{structs.JobTypeService}
c.Build = "0.9.2"
c.Build = "1.4.0"
})
defer cleanupS1()
testutil.WaitForLeader(t, s1.RPC)