keyring: update handle to state inside replication loop (#15227)

* keyring: update handle to state inside replication loop

When keyring replication starts, we take a handle to the state store. But
whenever a snapshot is restored, this handle is invalidated and no longer points
to a state store that is receiving new keys. This leaks a bunch of memory too!

In addition to operator-initiated restores, when fresh servers are added to
existing clusters with large-enough state, the keyring replication can get
started quickly enough that it's running before the snapshot from the existing
clusters have been restored.

Fix this by updating the handle to the state store on each pass.
This commit is contained in:
Tim Gross 2022-11-17 08:40:12 -05:00 committed by GitHub
parent c94c231c08
commit dd3a07302e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 37 additions and 23 deletions

3
.changelog/15227.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:bug
keyring: Fixed a bug where replication would stop after snapshot restores
```

View File

@ -431,18 +431,13 @@ func (krr *KeyringReplicator) stop() {
krr.stopFn()
}
const keyringReplicationRate = 10
const keyringReplicationRate = 5
func (krr *KeyringReplicator) run(ctx context.Context) {
limiter := rate.NewLimiter(keyringReplicationRate, keyringReplicationRate)
krr.logger.Debug("starting encryption key replication")
defer krr.logger.Debug("exiting key replication")
retryErrTimer, stop := helper.NewSafeTimer(time.Second * 1)
defer stop()
START:
store := krr.srv.fsm.State()
limiter := rate.NewLimiter(keyringReplicationRate, keyringReplicationRate)
for {
select {
@ -451,19 +446,17 @@ START:
case <-ctx.Done():
return
default:
// Rate limit how often we attempt replication
err := limiter.Wait(ctx)
if err != nil {
goto ERR_WAIT // rate limit exceeded
continue // rate limit exceeded
}
ws := store.NewWatchSet()
iter, err := store.RootKeyMetas(ws)
store := krr.srv.fsm.State()
iter, err := store.RootKeyMetas(nil)
if err != nil {
krr.logger.Error("failed to fetch keyring", "error", err)
goto ERR_WAIT
continue
}
for {
raw := iter.Next()
if raw == nil {
@ -488,16 +481,6 @@ START:
}
}
ERR_WAIT:
retryErrTimer.Reset(1 * time.Second)
select {
case <-retryErrTimer.C:
goto START
case <-ctx.Done():
return
}
}
// replicateKey replicates a single key from peer servers that was present in

View File

@ -1,6 +1,7 @@
package nomad
import (
"bytes"
"context"
"os"
"path/filepath"
@ -8,6 +9,7 @@ import (
"time"
msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc"
"github.com/shoenig/test/must"
"github.com/stretchr/testify/require"
"github.com/hashicorp/nomad/ci"
@ -300,6 +302,32 @@ func TestEncrypter_KeyringReplication(t *testing.T) {
time.Second*5, time.Second,
"expected new servers to get replicated keys")
// Scenario: reload a snapshot
t.Logf("taking snapshot of node5")
snapshot, err := srv5.fsm.Snapshot()
must.NoError(t, err)
defer snapshot.Release()
// Persist so we can read it back
buf := bytes.NewBuffer(nil)
sink := &MockSink{buf, false}
must.NoError(t, snapshot.Persist(sink))
must.NoError(t, srv5.fsm.Restore(sink))
// rotate the key
err = msgpackrpc.CallWithCodec(codec, "Keyring.Rotate", rotateReq, &rotateResp)
require.NoError(t, err)
keyID4 := rotateResp.Key.KeyID
require.Eventually(t, checkReplicationFn(keyID4),
time.Second*5, time.Second,
"expected new servers to get replicated keys after snapshot restore")
}
func TestEncrypter_EncryptDecrypt(t *testing.T) {