diff --git a/.changelog/15227.txt b/.changelog/15227.txt new file mode 100644 index 000000000..a22d4bc7a --- /dev/null +++ b/.changelog/15227.txt @@ -0,0 +1,3 @@ +```release-note:bug +keyring: Fixed a bug where replication would stop after snapshot restores +``` diff --git a/nomad/encrypter.go b/nomad/encrypter.go index 23b53db72..1f773bd95 100644 --- a/nomad/encrypter.go +++ b/nomad/encrypter.go @@ -431,18 +431,13 @@ func (krr *KeyringReplicator) stop() { krr.stopFn() } -const keyringReplicationRate = 10 +const keyringReplicationRate = 5 func (krr *KeyringReplicator) run(ctx context.Context) { - limiter := rate.NewLimiter(keyringReplicationRate, keyringReplicationRate) krr.logger.Debug("starting encryption key replication") defer krr.logger.Debug("exiting key replication") - retryErrTimer, stop := helper.NewSafeTimer(time.Second * 1) - defer stop() - -START: - store := krr.srv.fsm.State() + limiter := rate.NewLimiter(keyringReplicationRate, keyringReplicationRate) for { select { @@ -451,19 +446,17 @@ START: case <-ctx.Done(): return default: - // Rate limit how often we attempt replication err := limiter.Wait(ctx) if err != nil { - goto ERR_WAIT // rate limit exceeded + continue // rate limit exceeded } - ws := store.NewWatchSet() - iter, err := store.RootKeyMetas(ws) + store := krr.srv.fsm.State() + iter, err := store.RootKeyMetas(nil) if err != nil { krr.logger.Error("failed to fetch keyring", "error", err) - goto ERR_WAIT + continue } - for { raw := iter.Next() if raw == nil { @@ -488,16 +481,6 @@ START: } } -ERR_WAIT: - retryErrTimer.Reset(1 * time.Second) - - select { - case <-retryErrTimer.C: - goto START - case <-ctx.Done(): - return - } - } // replicateKey replicates a single key from peer servers that was present in diff --git a/nomad/encrypter_test.go b/nomad/encrypter_test.go index 43766d8c4..b16e4eae6 100644 --- a/nomad/encrypter_test.go +++ b/nomad/encrypter_test.go @@ -1,6 +1,7 @@ package nomad import ( + "bytes" "context" "os" "path/filepath" @@ -8,6 +9,7 @@ import ( "time" msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc" + "github.com/shoenig/test/must" "github.com/stretchr/testify/require" "github.com/hashicorp/nomad/ci" @@ -300,6 +302,32 @@ func TestEncrypter_KeyringReplication(t *testing.T) { time.Second*5, time.Second, "expected new servers to get replicated keys") + // Scenario: reload a snapshot + + t.Logf("taking snapshot of node5") + + snapshot, err := srv5.fsm.Snapshot() + must.NoError(t, err) + + defer snapshot.Release() + + // Persist so we can read it back + buf := bytes.NewBuffer(nil) + sink := &MockSink{buf, false} + must.NoError(t, snapshot.Persist(sink)) + + must.NoError(t, srv5.fsm.Restore(sink)) + + // rotate the key + + err = msgpackrpc.CallWithCodec(codec, "Keyring.Rotate", rotateReq, &rotateResp) + require.NoError(t, err) + keyID4 := rotateResp.Key.KeyID + + require.Eventually(t, checkReplicationFn(keyID4), + time.Second*5, time.Second, + "expected new servers to get replicated keys after snapshot restore") + } func TestEncrypter_EncryptDecrypt(t *testing.T) {