From 80ec5e13466ca3117b5927f9952ffc7ffaa7b46b Mon Sep 17 00:00:00 2001
From: Tim Gross
Date: Thu, 6 Oct 2022 12:47:02 -0400
Subject: [PATCH] fix panic from keyring raft entries being written during
 upgrade (#14821)

During an upgrade to Nomad 1.4.0, if a server running 1.4.0 becomes the
leader before all of the 1.3.x servers have been upgraded, the older
servers will crash, because the new leader initializes the keyring and
writes a raft entry they cannot apply.

Wait until all members are on a version that supports the keyring before
initializing it.
---
 .changelog/14821.txt     |  3 +++
 nomad/encrypter.go       |  3 +++
 nomad/leader.go          | 46 ++++++++++++++++++++++++++++------------
 nomad/plan_apply_test.go |  2 +-
 nomad/worker_test.go     |  2 +-
 5 files changed, 41 insertions(+), 15 deletions(-)
 create mode 100644 .changelog/14821.txt

diff --git a/.changelog/14821.txt b/.changelog/14821.txt
new file mode 100644
index 000000000..162f96100
--- /dev/null
+++ b/.changelog/14821.txt
@@ -0,0 +1,3 @@
+```release-note:bug
+keyring: Fixed a panic that can occur during upgrades to 1.4.0 when initializing the keyring
+```

diff --git a/nomad/encrypter.go b/nomad/encrypter.go
index 778715414..8b3851ff9 100644
--- a/nomad/encrypter.go
+++ b/nomad/encrypter.go
@@ -277,6 +277,9 @@ func (e *Encrypter) activeKeySetLocked() (*keyset, error) {
 	if err != nil {
 		return nil, err
 	}
+	if keyMeta == nil {
+		return nil, fmt.Errorf("keyring has not been initialized yet")
+	}
 	return e.keysetByIDLocked(keyMeta.KeyID)
 }
 
diff --git a/nomad/leader.go b/nomad/leader.go
index 950931ce9..4d82fd689 100644
--- a/nomad/leader.go
+++ b/nomad/leader.go
@@ -294,10 +294,7 @@ func (s *Server) establishLeadership(stopCh chan struct{}) error {
 	schedulerConfig := s.getOrCreateSchedulerConfig()
 
 	// Create the first root key if it doesn't already exist
-	err := s.initializeKeyring()
-	if err != nil {
-		return err
-	}
+	go s.initializeKeyring(stopCh)
 
 	// Initialize the ClusterID
 	_, _ = s.ClusterID()
@@ -1966,43 +1963,66 @@ func (s *Server) getOrCreateSchedulerConfig() *structs.SchedulerConfiguration {
 	return config
 }
 
+var minVersionKeyring = version.Must(version.NewVersion("1.4.0"))
+
 // initializeKeyring creates the first root key if the leader doesn't
 // already have one. The metadata will be replicated via raft and then
 // the followers will get the key material from their own key
 // replication.
-func (s *Server) initializeKeyring() error {
+func (s *Server) initializeKeyring(stopCh <-chan struct{}) {
+
+	logger := s.logger.Named("keyring")
 
 	store := s.fsm.State()
 	keyMeta, err := store.GetActiveRootKeyMeta(nil)
 	if err != nil {
-		return err
+		logger.Error("failed to get active key: %v", err)
+		return
 	}
 	if keyMeta != nil {
-		return nil
+		return
 	}
 
-	s.logger.Named("core").Trace("initializing keyring")
+	logger.Trace("verifying cluster is ready to initialize keyring")
+	for {
+		select {
+		case <-stopCh:
+			return
+		default:
+		}
+		if ServersMeetMinimumVersion(s.serf.Members(), minVersionKeyring, true) {
+			break
+		}
+	}
+	// we might have lost leadership during the version check
+	if !s.IsLeader() {
+		return
+	}
+
+	logger.Trace("initializing keyring")
 
 	rootKey, err := structs.NewRootKey(structs.EncryptionAlgorithmAES256GCM)
 	rootKey.Meta.SetActive()
 	if err != nil {
-		return fmt.Errorf("could not initialize keyring: %v", err)
+		logger.Error("could not initialize keyring: %v", err)
+		return
 	}
 
 	err = s.encrypter.AddKey(rootKey)
 	if err != nil {
-		return fmt.Errorf("could not add initial key to keyring: %v", err)
+		logger.Error("could not add initial key to keyring: %v", err)
+		return
 	}
 
 	if _, _, err = s.raftApply(structs.RootKeyMetaUpsertRequestType,
 		structs.KeyringUpdateRootKeyMetaRequest{
 			RootKeyMeta: rootKey.Meta,
 		}); err != nil {
-		return fmt.Errorf("could not initialize keyring: %v", err)
+		logger.Error("could not initialize keyring: %v", err)
+		return
 	}
 
-	s.logger.Named("core").Info("initialized keyring", "id", rootKey.Meta.KeyID)
-	return nil
+	logger.Info("initialized keyring", "id", rootKey.Meta.KeyID)
 }
 
 func (s *Server) generateClusterID() (string, error) {
diff --git a/nomad/plan_apply_test.go b/nomad/plan_apply_test.go
index bb9523476..fc21bc6e2 100644
--- a/nomad/plan_apply_test.go
+++ b/nomad/plan_apply_test.go
@@ -243,7 +243,7 @@ func TestPlanApply_applyPlanWithNormalizedAllocs(t *testing.T) {
 	ci.Parallel(t)
 
 	s1, cleanupS1 := TestServer(t, func(c *Config) {
-		c.Build = "0.9.2"
+		c.Build = "1.4.0"
 	})
 	defer cleanupS1()
 	testutil.WaitForLeader(t, s1.RPC)
diff --git a/nomad/worker_test.go b/nomad/worker_test.go
index d8d5f4481..a72304db9 100644
--- a/nomad/worker_test.go
+++ b/nomad/worker_test.go
@@ -488,7 +488,7 @@ func TestWorker_SubmitPlanNormalizedAllocations(t *testing.T) {
 	s1, cleanupS1 := TestServer(t, func(c *Config) {
 		c.NumSchedulers = 0
 		c.EnabledSchedulers = []string{structs.JobTypeService}
-		c.Build = "0.9.2"
+		c.Build = "1.4.0"
 	})
 	defer cleanupS1()
 	testutil.WaitForLeader(t, s1.RPC)
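
The fix hinges on comparing every server's advertised build version against the new minVersionKeyring before the first root key metadata is written to raft. ServersMeetMinimumVersion itself is not part of this diff, so the sketch below is a minimal, self-contained illustration of the comparison it is expected to perform, using the same github.com/hashicorp/go-version package as the minVersionKeyring variable; the allMembersMeetMinimum helper and the plain-string version list are simplifying assumptions for the example, not Nomad's actual API.

```go
// Minimal sketch of the version gate, under the assumptions stated above.
package main

import (
	"fmt"

	version "github.com/hashicorp/go-version"
)

// minVersionKeyring mirrors the variable introduced in nomad/leader.go above.
var minVersionKeyring = version.Must(version.NewVersion("1.4.0"))

// allMembersMeetMinimum (hypothetical helper) reports whether every advertised
// build version is at or above the minimum. This is the condition the leader
// polls for before writing the first root key metadata to raft.
func allMembersMeetMinimum(memberVersions []string, min *version.Version) (bool, error) {
	for _, raw := range memberVersions {
		v, err := version.NewVersion(raw)
		if err != nil {
			return false, fmt.Errorf("failed to parse member version %q: %w", raw, err)
		}
		if v.LessThan(min) {
			return false, nil
		}
	}
	return true, nil
}

func main() {
	mixed := []string{"1.4.0", "1.3.5"}    // mid-upgrade cluster
	upgraded := []string{"1.4.0", "1.4.1"} // fully upgraded cluster

	ok, _ := allMembersMeetMinimum(mixed, minVersionKeyring)
	fmt.Println("mixed cluster ready:", ok) // false: a 1.3.x server would crash on the keyring raft entry

	ok, _ = allMembersMeetMinimum(upgraded, minVersionKeyring)
	fmt.Println("upgraded cluster ready:", ok) // true: safe to initialize the keyring
}
```

Running the real check in a goroutine (go s.initializeKeyring(stopCh)) keeps establishLeadership from blocking while a mixed-version cluster finishes upgrading, and the s.IsLeader() re-check after the wait avoids writing the keyring raft entry from a server that lost leadership in the meantime.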