diff --git a/builtin/logical/pki/crl_util.go b/builtin/logical/pki/crl_util.go index 42b3e6843..f6e72bc39 100644 --- a/builtin/logical/pki/crl_util.go +++ b/builtin/logical/pki/crl_util.go @@ -367,7 +367,28 @@ func (cb *crlBuilder) getPresentLocalDeltaWALForClearing(sc *storageContext) ([] } func (cb *crlBuilder) getPresentUnifiedDeltaWALForClearing(sc *storageContext) ([]string, error) { - return cb._getPresentDeltaWALForClearing(sc, unifiedDeltaWALPath) + walClusters, err := sc.Storage.List(sc.Context, unifiedDeltaWALPrefix) + if err != nil { + return nil, fmt.Errorf("error fetching list of clusters with delta WAL entries: %w", err) + } + + var allPaths []string + for index, cluster := range walClusters { + prefix := unifiedDeltaWALPrefix + cluster + clusterPaths, err := cb._getPresentDeltaWALForClearing(sc, prefix) + if err != nil { + return nil, fmt.Errorf("error fetching delta WAL entries for cluster (%v / %v): %w", index, cluster, err) + } + + // Here, we don't want to include the unifiedDeltaWALPrefix because + // clearUnifiedDeltaWAL handles that for us. Instead, just include + // the cluster identifier. + for _, clusterPath := range clusterPaths { + allPaths = append(allPaths, cluster+clusterPath) + } + } + + return allPaths, nil } func (cb *crlBuilder) _clearDeltaWAL(sc *storageContext, walSerials []string, path string) error { @@ -514,49 +535,70 @@ func (cb *crlBuilder) _shouldRebuildUnifiedCRLs(sc *storageContext, override boo return false, nil } + // If we're overriding whether we should build Delta CRLs, always return + // true, even if storage errors might've happen. + if override { + return true, nil + } + // Fetch two storage entries to see if we actually need to do this - // rebuild, given we're within the window. - lastWALEntry, err := sc.Storage.Get(sc.Context, unifiedDeltaWALLastRevokedSerial) - if err != nil || !override && (lastWALEntry == nil || lastWALEntry.Value == nil) { - // If this entry does not exist, we don't need to rebuild the - // delta WAL due to the expiration assumption above. There must - // not have been any new revocations. Since err should be nil - // in this case, we can safely return it. - return false, err - } - - lastBuildEntry, err := sc.Storage.Get(sc.Context, unifiedDeltaWALLastBuildSerial) + // rebuild, given we're within the window. We need to fetch these + // two entries per cluster. + clusters, err := sc.Storage.List(sc.Context, unifiedDeltaWALPrefix) if err != nil { - return false, err + return false, fmt.Errorf("failed to get the list of clusters having written Delta WALs: %w", err) } - if !override && lastBuildEntry != nil && lastBuildEntry.Value != nil { - // If the last build entry doesn't exist, we still want to build a - // new delta WAL, since this could be our very first time doing so. - // + // If any cluster tells us to rebuild, we should rebuild. + shouldRebuild := false + for index, cluster := range clusters { + prefix := unifiedDeltaWALPrefix + cluster + clusterUnifiedLastRevokedWALEntry := prefix + deltaWALLastRevokedSerialName + clusterUnifiedLastBuiltWALEntry := prefix + deltaWALLastBuildSerialName + + lastWALEntry, err := sc.Storage.Get(sc.Context, clusterUnifiedLastRevokedWALEntry) + if err != nil { + return false, fmt.Errorf("failed fetching last revoked WAL entry for cluster (%v / %v): %w", index, cluster, err) + } + + if lastWALEntry == nil || lastWALEntry.Value == nil { + continue + } + + lastBuildEntry, err := sc.Storage.Get(sc.Context, clusterUnifiedLastBuiltWALEntry) + if err != nil { + return false, fmt.Errorf("failed fetching last built CRL WAL entry for cluster (%v / %v): %w", index, cluster, err) + } + + if lastBuildEntry == nil || lastBuildEntry.Value == nil { + // If the last build entry doesn't exist, we still want to build a + // new delta WAL, since this could be our very first time doing so. + shouldRebuild = true + break + } + // Otherwise, here, now that we know it exists, we want to check this // value against the other value. Since we previously guarded the WAL // entry being non-empty, we're good to decode everything within this // guard. var walInfo lastWALInfo if err := lastWALEntry.DecodeJSON(&walInfo); err != nil { - return false, err + return false, fmt.Errorf("failed decoding last revoked WAL entry for cluster (%v / %v): %w", index, cluster, err) } var deltaInfo lastDeltaInfo if err := lastBuildEntry.DecodeJSON(&deltaInfo); err != nil { - return false, err + return false, fmt.Errorf("failed decoding last built CRL WAL entry for cluster (%v / %v): %w", index, cluster, err) } - // Here, everything decoded properly and we know that no new certs - // have been revoked since we built this last delta CRL. We can exit - // without rebuilding then. - if walInfo.Serial == deltaInfo.Serial { - return false, nil + if walInfo.Serial != deltaInfo.Serial { + shouldRebuild = true + break } } - return true, nil + // No errors occurred, so return the result. + return shouldRebuild, nil } func (cb *crlBuilder) rebuildDeltaCRLs(sc *storageContext, forceNew bool) error { @@ -982,8 +1024,20 @@ func revokeCert(sc *storageContext, config *crlConfig, cert *x509.Certificate) ( } sc.Backend.ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted, revEntry.Key) + // From here on out, the certificate has been revoked locally. Any other + // persistence issues might still err, but any other failure messages + // should be added as warnings to the revocation. + resp := &logical.Response{ + Data: map[string]interface{}{ + "revocation_time": revInfo.RevocationTime, + "revocation_time_rfc3339": revInfo.RevocationTimeUTC.Format(time.RFC3339Nano), + "state": "revoked", + }, + } + // If this flag is enabled after the fact, existing local entries will be published to // the unified storage space through a periodic function. + failedWritingUnifiedCRL := false if config.UnifiedCRL { entry := &unifiedRevocationEntry{ SerialNumber: colonSerial, @@ -999,6 +1053,9 @@ func revokeCert(sc *storageContext, config *crlConfig, cert *x509.Certificate) ( sc.Backend.Logger().Error("Failed to write unified revocation entry, will re-attempt later", "serial_number", colonSerial, "error", ignoreErr) sc.Backend.unifiedTransferStatus.forceRun() + + resp.AddWarning(fmt.Sprintf("Failed to write unified revocation entry, will re-attempt later: %v", err)) + failedWritingUnifiedCRL = true } } @@ -1017,26 +1074,20 @@ func revokeCert(sc *storageContext, config *crlConfig, cert *x509.Certificate) ( } } } else if config.EnableDelta { - if err := writeRevocationDeltaWALs(sc, config, hyphenSerial, colonSerial); err != nil { + if err := writeRevocationDeltaWALs(sc, config, resp, failedWritingUnifiedCRL, hyphenSerial, colonSerial); err != nil { return nil, fmt.Errorf("failed to write WAL entries for Delta CRLs: %w", err) } } - return &logical.Response{ - Data: map[string]interface{}{ - "revocation_time": revInfo.RevocationTime, - "revocation_time_rfc3339": revInfo.RevocationTimeUTC.Format(time.RFC3339Nano), - "state": "revoked", - }, - }, nil + return resp, nil } -func writeRevocationDeltaWALs(sc *storageContext, config *crlConfig, hyphenSerial string, colonSerial string) error { +func writeRevocationDeltaWALs(sc *storageContext, config *crlConfig, resp *logical.Response, failedWritingUnifiedCRL bool, hyphenSerial string, colonSerial string) error { if err := writeSpecificRevocationDeltaWALs(sc, hyphenSerial, colonSerial, localDeltaWALPath); err != nil { return fmt.Errorf("failed to write local delta WAL entry: %w", err) } - if config.UnifiedCRL { + if config.UnifiedCRL && !failedWritingUnifiedCRL { // We only need to write cross-cluster unified Delta WAL entries when // it is enabled; in particular, because we rebuild CRLs when enabling // this flag, any revocations that happened prior to enabling unified @@ -1046,13 +1097,21 @@ func writeRevocationDeltaWALs(sc *storageContext, config *crlConfig, hyphenSeria // listing for the unified CRL rebuild, this revocation will not // appear on either the main or the next delta CRL, but will need to // wait for a subsequent complete CRL rebuild). + // + // Lastly, we don't attempt this if the unified CRL entry failed to + // write, as we need that entry before the delta WAL entry will make + // sense. if ignoredErr := writeSpecificRevocationDeltaWALs(sc, hyphenSerial, colonSerial, unifiedDeltaWALPath); ignoredErr != nil { // Just log the error if we fail to write across clusters, a separate background // thread will reattempt it later on as we have the local write done. sc.Backend.Logger().Error("Failed to write cross-cluster delta WAL entry, will re-attempt later", "serial_number", colonSerial, "error", ignoredErr) sc.Backend.unifiedTransferStatus.forceRun() + + resp.AddWarning(fmt.Sprintf("Failed to write cross-cluster delta WAL entry, will re-attempt later: %v", ignoredErr)) } + } else if failedWritingUnifiedCRL { + resp.AddWarning("Skipping cross-cluster delta WAL entry as cross-cluster revocation failed to write; will re-attempt later.") } return nil @@ -1275,7 +1334,7 @@ func buildAnyCRLs(sc *storageContext, forceNew bool, isDelta bool) error { } func getLastWALSerial(sc *storageContext, path string) (string, error) { - lastWALEntry, err := sc.Storage.Get(sc.Context, localDeltaWALLastRevokedSerial) + lastWALEntry, err := sc.Storage.Get(sc.Context, path) if err != nil { return "", err } @@ -1438,11 +1497,23 @@ func buildAnyUnifiedCRLs( // (and potentially more) in it; when we're done writing the delta CRL, // we'll write this serial as a sentinel to see if we need to rebuild it // in the future. - var lastDeltaSerial string + // + // We need to do this per-cluster. + lastDeltaSerial := map[string]string{} if isDelta { - lastDeltaSerial, err = getLastWALSerial(sc, unifiedDeltaWALLastRevokedSerial) + clusters, err := sc.Storage.List(sc.Context, unifiedDeltaWALPrefix) if err != nil { - return nil, err + return nil, fmt.Errorf("error listing clusters for unified delta WAL building: %w", err) + } + + for index, cluster := range clusters { + path := unifiedDeltaWALPrefix + cluster + deltaWALLastRevokedSerialName + serial, err := getLastWALSerial(sc, path) + if err != nil { + return nil, fmt.Errorf("error getting last written Delta WAL serial for cluster (%v / %v): %w", index, cluster, err) + } + + lastDeltaSerial[cluster] = serial } } @@ -1513,12 +1584,20 @@ func buildAnyUnifiedCRLs( // for a while. sc.Backend.crlBuilder.lastDeltaRebuildCheck = time.Now() - if len(lastDeltaSerial) > 0 { - // When we have a last delta serial, write out the relevant info - // so we can skip extra CRL rebuilds. - deltaInfo := lastDeltaInfo{Serial: lastDeltaSerial} + // Persist all of our known last revoked serial numbers here, as the + // last seen serial during build. This will allow us to detect if any + // new revocations have occurred, forcing us to rebuild the delta CRL. + for cluster, serial := range lastDeltaSerial { + if len(serial) == 0 { + continue + } - lastDeltaBuildEntry, err := logical.StorageEntryJSON(unifiedDeltaWALLastBuildSerial, deltaInfo) + // Make sure to use the cluster-specific path. Since we're on the + // active node of the primary cluster, we own this entry and can + // safely write it. + path := unifiedDeltaWALPrefix + cluster + deltaWALLastBuildSerialName + deltaInfo := lastDeltaInfo{Serial: serial} + lastDeltaBuildEntry, err := logical.StorageEntryJSON(path, deltaInfo) if err != nil { return nil, fmt.Errorf("error creating last delta CRL rebuild serial entry: %w", err) } diff --git a/builtin/logical/pki/periodic.go b/builtin/logical/pki/periodic.go index 18032fbef..b8119e0cd 100644 --- a/builtin/logical/pki/periodic.go +++ b/builtin/logical/pki/periodic.go @@ -11,6 +11,7 @@ import ( "time" "github.com/hashicorp/vault/sdk/helper/consts" + "github.com/hashicorp/vault/sdk/logical" ) const ( @@ -90,7 +91,16 @@ func runUnifiedTransfer(sc *storageContext) { if err != nil { b.Logger().Error("an error occurred running unified transfer", "error", err.Error()) status.forceRerun.Store(true) + } else { + if config.EnableDelta { + err = doUnifiedTransferMissingDeltaWALSerials(sc, clusterId) + if err != nil { + b.Logger().Error("an error occurred running unified transfer", "error", err.Error()) + status.forceRerun.Store(true) + } + } } + status.lastRun = time.Now() } @@ -122,7 +132,7 @@ func doUnifiedTransferMissingLocalSerials(sc *storageContext, clusterId string) err := readRevocationEntryAndTransfer(sc, serialNum) if err != nil { errCount++ - sc.Backend.Logger().Debug("Failed transferring local revocation to unified space", + sc.Backend.Logger().Error("Failed transferring local revocation to unified space", "serial", serialNum, "error", err) } } @@ -135,6 +145,152 @@ func doUnifiedTransferMissingLocalSerials(sc *storageContext, clusterId string) return nil } +func doUnifiedTransferMissingDeltaWALSerials(sc *storageContext, clusterId string) error { + // We need to do a similar thing for Delta WAL entry certificates. + // When the delta WAL failed to write for one or more entries, + // we'll need to replicate these up to the primary cluster. When it + // has performed a new delta WAL build, it will empty storage and + // update to a last written WAL entry that exceeds what we've seen + // locally. + thisUnifiedWALEntryPath := unifiedDeltaWALPath + deltaWALLastRevokedSerialName + lastUnifiedWALEntry, err := getLastWALSerial(sc, thisUnifiedWALEntryPath) + if err != nil { + return fmt.Errorf("failed to fetch last cross-cluster unified revoked delta WAL serial number: %w", err) + } + + lastLocalWALEntry, err := getLastWALSerial(sc, localDeltaWALLastRevokedSerial) + if err != nil { + return fmt.Errorf("failed to fetch last locally revoked delta WAL serial number: %w", err) + } + + // We now need to transfer all the entries and then write the last WAL + // entry at the end. Start by listing all certificates; any missing + // certificates will be copied over and then the WAL entry will be + // updated once. + // + // We do not delete entries either locally or remotely, as either + // cluster could've rebuilt delta CRLs with out-of-sync information, + // removing some entries (and, we cannot differentiate between these + // two cases). On next full CRL rebuild (on either cluster), the state + // should get synchronized, and future delta CRLs after this function + // returns without issue will see the remaining entries. + // + // Lastly, we need to ensure we don't accidentally write any unified + // delta WAL entries that aren't present in the main cross-cluster + // revoked storage location. This would mean the above function failed + // to copy them for some reason, despite them presumably appearing + // locally. + _unifiedWALEntries, err := sc.Storage.List(sc.Context, unifiedDeltaWALPath) + if err != nil { + return fmt.Errorf("failed to list cross-cluster unified delta WAL storage: %w", err) + } + unifiedWALEntries := sliceToMapKey(_unifiedWALEntries) + + _unifiedRevokedSerials, err := listClusterSpecificUnifiedRevokedCerts(sc, clusterId) + if err != nil { + return fmt.Errorf("failed to list cross-cluster revoked certificates: %w", err) + } + unifiedRevokedSerials := sliceToMapKey(_unifiedRevokedSerials) + + localWALEntries, err := sc.Storage.List(sc.Context, localDeltaWALPath) + if err != nil { + return fmt.Errorf("failed to list local delta WAL storage: %w", err) + } + + if lastUnifiedWALEntry == lastLocalWALEntry && len(_unifiedWALEntries) == len(localWALEntries) { + // Writing the last revoked WAL entry is the last thing that we do. + // Because these entries match (across clusters) and we have the same + // number of entries, assume we don't have anything to sync and exit + // early. + // + // We need both checks as, in the event of PBPWF failing and then + // returning while more revocations are happening, we could have + // been schedule to run, but then skip running (if only the first + // condition was checked) because a later revocation succeeded + // in writing a unified WAL entry, before we started replicating + // the rest back up. + // + // The downside of this approach is that, if the main cluster + // does a full rebuild in the mean time, we could re-sync more + // entries back up to the primary cluster that are already + // included in the complete CRL. Users can manually rebuild the + // full CRL (clearing these duplicate delta CRL entries) if this + // affects them. + return nil + } + + errCount := 0 + for index, serial := range localWALEntries { + if index%25 == 0 { + config, _ := sc.Backend.crlBuilder.getConfigWithUpdate(sc) + if config != nil && (!config.UnifiedCRL || !config.EnableDelta) { + return errors.New("unified or delta CRLs have been disabled after we started, stopping") + } + } + + if serial == deltaWALLastBuildSerialName || serial == deltaWALLastRevokedSerialName { + // Skip our special serial numbers. + continue + } + + _, isAlreadyPresent := unifiedWALEntries[serial] + if isAlreadyPresent { + // Serial exists on both local and unified cluster. We're + // presuming we don't need to read and re-write these entries + // and that only missing entries need to be updated. + continue + } + + _, isRevokedCopied := unifiedRevokedSerials[serial] + if !isRevokedCopied { + // We need to wait here to copy over. + errCount += 1 + sc.Backend.Logger().Debug("Delta WAL exists locally, but corresponding cross-cluster full revocation entry is missing; skipping", "serial", serial) + continue + } + + // All good: read the local entry and write to the remote variant. + localPath := localDeltaWALPath + serial + unifiedPath := unifiedDeltaWALPath + serial + + entry, err := sc.Storage.Get(sc.Context, localPath) + if err != nil || entry == nil { + errCount += 1 + sc.Backend.Logger().Error("Failed reading local delta WAL entry to copy to cross-cluster", "serial", serial, "err", err) + continue + } + + entry.Key = unifiedPath + err = sc.Storage.Put(sc.Context, entry) + if err != nil { + errCount += 1 + sc.Backend.Logger().Error("Failed sync local delta WAL entry to cross-cluster unified delta WAL location", "serial", serial, "err", err) + continue + } + } + + if errCount > 0 { + // See note above about why we don't fail here. + sc.Backend.Logger().Warn(fmt.Sprintf("Failed transfering %d local delta WAL serials to unified storage", errCount)) + return nil + } + + // Everything worked. Here, we can write over the delta WAL last revoked + // value. By using the earlier value, even if new revocations have + // occurred, we ensure any further missing entries can be handled in the + // next round. + lastRevSerial := lastWALInfo{Serial: lastLocalWALEntry} + lastWALEntry, err := logical.StorageEntryJSON(thisUnifiedWALEntryPath, lastRevSerial) + if err != nil { + return fmt.Errorf("unable to create cross-cluster unified last delta CRL WAL entry: %w", err) + } + if err = sc.Storage.Put(sc.Context, lastWALEntry); err != nil { + return fmt.Errorf("error saving cross-cluster unified last delta CRL WAL entry: %w", err) + } + + return nil +} + func readRevocationEntryAndTransfer(sc *storageContext, serial string) error { hyphenSerial := normalizeSerial(serial) revInfo, err := sc.fetchRevocationInfo(hyphenSerial) diff --git a/changelog/20058.txt b/changelog/20058.txt new file mode 100644 index 000000000..e43a1f4ad --- /dev/null +++ b/changelog/20058.txt @@ -0,0 +1,3 @@ +```release-note:bug +secrets/pki: Fix building of unified delta CRLs and recovery during unified delta WAL write failures. +```