Rework locking within the PKI CRLBuilder (#15325)

- Do not grab a lock within the requestRebuildIfActiveNode function to avoid issues being called from the invalidate function - Leverage more atmoic operations, and only grab the lock if we are going to perform the rebuild.
2022-05-11 13:14:21 -04:00 · 2022-05-11 13:14:21 -04:00 · 20a78a2118
parent 48f3279c49
commit 20a78a2118
2 changed files with 60 additions and 7 deletions
--- a/builtin/logical/pki/crl_test.go
+++ b/builtin/logical/pki/crl_test.go
@ -154,6 +154,52 @@ func TestBackend_Secondary_CRL_Rebuilding(t *testing.T) {
 	require.Equal(t, 0, len(crl.RevokedCertificates))
 }

+func TestCrlRebuilder(t *testing.T) {
+	ctx := context.Background()
+	b, s := createBackendWithStorage(t)
+	mkc := newManagedKeyContext(ctx, b, "test")
+
+	// Write out the issuer/key to storage without going through the api call as replication would.
+	bundle := genCertBundle(t, b, s)
+	_, _, err := writeCaBundle(mkc, s, bundle, "", "")
+	require.NoError(t, err)
+
+	req := &logical.Request{Storage: s}
+	cb := crlBuilder{}
+
+	// Force an initial build
+	err = cb.rebuild(ctx, b, req, true)
+	require.NoError(t, err, "Failed to rebuild CRL")
+
+	resp := requestCrlFromBackend(t, s, b)
+	crl1 := parseCrlPemBytes(t, resp.Data["http_raw_body"].([]byte))
+
+	// We shouldn't rebuild within this call.
+	err = cb.rebuildIfForced(ctx, b, req)
+	require.NoError(t, err, "Failed to rebuild if forced CRL")
+	resp = requestCrlFromBackend(t, s, b)
+	crl2 := parseCrlPemBytes(t, resp.Data["http_raw_body"].([]byte))
+	require.Equal(t, crl1.ThisUpdate, crl2.ThisUpdate, "According to the update field, we rebuilt the CRL")
+
+	// Make sure we have ticked over to the next second
+	for {
+		diff := time.Now().Sub(crl1.ThisUpdate)
+		if diff.Seconds() >= 1 {
+			break
+		}
+		time.Sleep(100 * time.Millisecond)
+	}
+
+	// This should rebuild the CRL
+	cb.requestRebuildIfActiveNode(b)
+	err = cb.rebuildIfForced(ctx, b, req)
+	require.NoError(t, err, "Failed to rebuild if forced CRL")
+	resp = requestCrlFromBackend(t, s, b)
+	crl3 := parseCrlPemBytes(t, resp.Data["http_raw_body"].([]byte))
+	require.True(t, crl1.ThisUpdate.Before(crl3.ThisUpdate),
+		"initial crl time: %#v not before next crl rebuild time: %#v", crl1.ThisUpdate, crl3.ThisUpdate)
+}
+
 func requestCrlFromBackend(t *testing.T, s logical.Storage, b *backend) *logical.Response {
 	crlReq := &logical.Request{
 		Operation: logical.ReadOperation,
--- a/builtin/logical/pki/crl_util.go
+++ b/builtin/logical/pki/crl_util.go
@ -61,7 +61,9 @@ func (cb *crlBuilder) rebuild(ctx context.Context, b *backend, request *logical.

 // requestRebuildIfActiveNode will schedule a rebuild of the CRL from the next read or write api call assuming we are the active node of a cluster
 func (cb *crlBuilder) requestRebuildIfActiveNode(b *backend) {
-	// Only schedule us on active nodes, ignoring secondary nodes, the active can/should rebuild the CRL.
+	// Only schedule us on active nodes, as the active node is the only node that can rebuild/write the CRL.
+	// Note 1: The CRL is cluster specific, so this does need to run on the active node of a performance secondary cluster.
+	// Note 2: This is called by the storage invalidation function, so it should not block.
 	if b.System().ReplicationState().HasState(consts.ReplicationPerformanceStandby) ||
 		b.System().ReplicationState().HasState(consts.ReplicationDRSecondary) {
 		b.Logger().Debug("Ignoring request to schedule a CRL rebuild, not on active node.")
@ -69,19 +71,24 @@ func (cb *crlBuilder) requestRebuildIfActiveNode(b *backend) {
 	}

 	b.Logger().Info("Scheduling PKI CRL rebuild.")
-	cb.m.Lock()
-	defer cb.m.Unlock()
-	atomic.StoreUint32(&cb.forceRebuild, 1)
+	// Set the flag to 1, we don't care if we aren't the ones that actually swap it to 1.
+	atomic.CompareAndSwapUint32(&cb.forceRebuild, 0, 1)
 }

 func (cb *crlBuilder) _doRebuild(ctx context.Context, b *backend, request *logical.Request, forceNew bool, ignoreForceFlag bool) error {
 	cb.m.Lock()
 	defer cb.m.Unlock()
-	if cb.forceRebuild == 1 || ignoreForceFlag {
-		defer atomic.StoreUint32(&cb.forceRebuild, 0)
+	// Re-read the lock in case someone beat us to the punch between the previous load op.
+	forceBuildFlag := atomic.LoadUint32(&cb.forceRebuild)
+	if forceBuildFlag == 1 || ignoreForceFlag {
+		// Reset our original flag back to 0 before we start the rebuilding. This may lead to another round of
+		// CRL building, but we want to avoid the race condition caused by clearing the flag after we completed (An
+		// update/revocation occurred attempting to set the flag, after we listed the certs but before we wrote
+		// the CRL, so we missed the update and cleared the flag.)
+		atomic.CompareAndSwapUint32(&cb.forceRebuild, 1, 0)

 		// if forceRebuild was requested, that should force a complete rebuild even if requested not too by forceNew
-		myForceNew := cb.forceRebuild == 1 || forceNew
+		myForceNew := forceBuildFlag == 1 || forceNew
 		return buildCRLs(ctx, b, request, myForceNew)
 	}