From 3075c5bd6577fdf45bed9bd5ce4920ce38479afe Mon Sep 17 00:00:00 2001 From: Nick Cabatoff Date: Fri, 9 Sep 2022 12:19:57 -0400 Subject: [PATCH] Do not attempt to write a new TLS keyring at startup if raft is already setup (#17079) --- changelog/17079.txt | 2 + vault/raft.go | 110 ++++++++++++++++++++++---------------------- 2 files changed, 58 insertions(+), 54 deletions(-) create mode 100644 changelog/17079.txt diff --git a/changelog/17079.txt b/changelog/17079.txt new file mode 100644 index 000000000..1f3d7b14e --- /dev/null +++ b/changelog/17079.txt @@ -0,0 +1,2 @@ +```release-note:bug +storage/raft: Fix error writing raft TLS keyring during follower joins \ No newline at end of file diff --git a/vault/raft.go b/vault/raft.go index 3d6b9c4c7..a081325ba 100644 --- a/vault/raft.go +++ b/vault/raft.go @@ -73,72 +73,74 @@ func (c *Core) startRaftBackend(ctx context.Context) (retErr error) { return nil } - // Retrieve the raft TLS information - raftTLSEntry, err := c.barrier.Get(ctx, raftTLSStoragePath) - if err != nil { - return err - } - var creating bool var raftTLS *raft.TLSKeyring - switch raftTLSEntry { - case nil: - // If this is HA-only and no TLS keyring is found, that means the - // cluster has not been bootstrapped or joined. We return early here in - // this case. If we return here, the raft object has not been instantiated, - // and a bootstrap call should be made. - if c.isRaftHAOnly() { - c.logger.Trace("skipping raft backend setup during unseal, no bootstrap operation has been started yet") - return nil - } - - // If we did not find a TLS keyring we will attempt to create one here. - // This happens after a storage migration process. This node is also - // marked to start as leader so we can write the new TLS Key. This is an - // error condition if there are already multiple nodes in the cluster, - // and the below storage write will fail. If the cluster is somehow in - // this state the unseal will fail and a cluster recovery will need to - // be done. - creating = true - raftTLSKey, err := raft.GenerateTLSKey(c.secureRandomReader) + if !raftBackend.Initialized() { + // Retrieve the raft TLS information + raftTLSEntry, err := c.barrier.Get(ctx, raftTLSStoragePath) if err != nil { return err } - raftTLS = &raft.TLSKeyring{ - Keys: []*raft.TLSKey{raftTLSKey}, - ActiveKeyID: raftTLSKey.ID, + switch raftTLSEntry { + case nil: + // If this is HA-only and no TLS keyring is found, that means the + // cluster has not been bootstrapped or joined. We return early here in + // this case. If we return here, the raft object has not been instantiated, + // and a bootstrap call should be made. + if c.isRaftHAOnly() { + c.logger.Trace("skipping raft backend setup during unseal, no bootstrap operation has been started yet") + return nil + } + + // If we did not find a TLS keyring we will attempt to create one here. + // This happens after a storage migration process. This node is also + // marked to start as leader so we can write the new TLS Key. This is an + // error condition if there are already multiple nodes in the cluster, + // and the below storage write will fail. If the cluster is somehow in + // this state the unseal will fail and a cluster recovery will need to + // be done. + creating = true + raftTLSKey, err := raft.GenerateTLSKey(c.secureRandomReader) + if err != nil { + return err + } + + raftTLS = &raft.TLSKeyring{ + Keys: []*raft.TLSKey{raftTLSKey}, + ActiveKeyID: raftTLSKey.ID, + } + default: + raftTLS = new(raft.TLSKeyring) + if err := raftTLSEntry.DecodeJSON(raftTLS); err != nil { + return err + } } - default: - raftTLS = new(raft.TLSKeyring) - if err := raftTLSEntry.DecodeJSON(raftTLS); err != nil { + + hasState, err := raftBackend.HasState() + if err != nil { return err } - } - hasState, err := raftBackend.HasState() - if err != nil { - return err - } + // This can be hit on follower nodes that got their config updated to use + // raft for HA-only before they are joined to the cluster. Since followers + // in this case use shared storage, it doesn't return early from the TLS + // case above, but there's not raft state yet for the backend to call + // raft.SetupCluster. + if !hasState { + c.logger.Trace("skipping raft backend setup during unseal, no raft state found") + return nil + } - // This can be hit on follower nodes that got their config updated to use - // raft for HA-only before they are joined to the cluster. Since followers - // in this case use shared storage, it doesn't return early from the TLS - // case above, but there's not raft state yet for the backend to call - // raft.SetupCluster. - if !hasState { - c.logger.Trace("skipping raft backend setup during unseal, no raft state found") - return nil - } + raftBackend.SetRestoreCallback(c.raftSnapshotRestoreCallback(true, true)) - raftBackend.SetRestoreCallback(c.raftSnapshotRestoreCallback(true, true)) - - if err := raftBackend.SetupCluster(ctx, raft.SetupOpts{ - TLSKeyring: raftTLS, - ClusterListener: c.getClusterListener(), - StartAsLeader: creating, - }); err != nil { - return err + if err := raftBackend.SetupCluster(ctx, raft.SetupOpts{ + TLSKeyring: raftTLS, + ClusterListener: c.getClusterListener(), + StartAsLeader: creating, + }); err != nil { + return err + } } defer func() {