From 73a05ebbe5cb41d89b50f74d6419560c7b5d2a78 Mon Sep 17 00:00:00 2001 From: Alexander Scheel Date: Tue, 11 Apr 2023 14:02:58 -0400 Subject: [PATCH] Fix building unified delta WAL, unified delta CRLs (#20058) * Correctly find certificates for unified delta CRL When building the unified delta CRL, WAL entries from the non-primary cluster were ignored. This resulted in an incomplete delta CRL, preventing some entries from appearing. Signed-off-by: Alexander Scheel * Correctly rebuild unified delta CRLs When deciding if the Unified Delta CRL should be rebuilt, we need to check the status of all clusters and their last revoked serial numbers. If any new serial has been revoked on any cluster, we should rebuild the unified delta CRLs. Signed-off-by: Alexander Scheel * Correctly persist Unified Delta CRL build entries When building the unified CRL, we need to read the last seen serial number from all clusters, not just the present cluster, and write it to the last built serial for that cluster's unified delta WAL entry. This prevents us from continuously rebuilding unified CRLs now that we have fixed our rebuild heuristic. Signed-off-by: Alexander Scheel * Fix getLastWALSerial for unified delta CRLs getLastWALSerial ignored its path argument, preventing it from reading the specified cluster-specific WAL entry. On the primary cluster, this was mostly equivalent, but now that we're correctly reading WAL entries and revocations for other clusters, we need to handle reading these entries correctly. Signed-off-by: Alexander Scheel * Copy delta WAL entries in event of failure Any local delta WAL should be persisted to unified delta WAL space as well. If such unified persistence fails, we need to ensure that they get eventually moved up, otherwise they'll remain missing until the next full CRL rebuild occurs, which might be significantly longer than when the next delta CRL rebuild would otherwise occur. runUnifiedTransfer already handles this for us, but it lacked logic for delta WAL serials. The only interesting catch here is that we refuse to copy any entries whose full unified revocation entry has not also been written. Signed-off-by: Alexander Scheel * Make doUnifiedTransferMissingLocalSerials log an error This message is mostly an error and would always be helpful information to have when troubleshooting failures. Signed-off-by: Alexander Scheel * Warn on cross-cluster write failures during revoke When revoking certificates, we log cross-cluster revocation failures, but we should really expose this information to the caller, that their local revocation was successful, but their cross-cluster revocation failed. Signed-off-by: Alexander Scheel * Ensure unified delta WAL entry has full entry Delta WAL entries are empty files whose only information (a revoked serial number) is contained in the file path. These depend implicitly on a full revocation entry existing for this file (whether a cross-cluster unified entry or a local entry). We should not write unified delta WAL entries without the corresponding full unified revocation entry existing. Add a warning in this case. Signed-off-by: Alexander Scheel * Add changelog entry Signed-off-by: Alexander Scheel --------- Signed-off-by: Alexander Scheel --- builtin/logical/pki/crl_util.go | 167 +++++++++++++++++++++++--------- builtin/logical/pki/periodic.go | 158 +++++++++++++++++++++++++++++- changelog/20058.txt | 3 + 3 files changed, 283 insertions(+), 45 deletions(-) create mode 100644 changelog/20058.txt diff --git a/builtin/logical/pki/crl_util.go b/builtin/logical/pki/crl_util.go index 42b3e6843..f6e72bc39 100644 --- a/builtin/logical/pki/crl_util.go +++ b/builtin/logical/pki/crl_util.go @@ -367,7 +367,28 @@ func (cb *crlBuilder) getPresentLocalDeltaWALForClearing(sc *storageContext) ([] } func (cb *crlBuilder) getPresentUnifiedDeltaWALForClearing(sc *storageContext) ([]string, error) { - return cb._getPresentDeltaWALForClearing(sc, unifiedDeltaWALPath) + walClusters, err := sc.Storage.List(sc.Context, unifiedDeltaWALPrefix) + if err != nil { + return nil, fmt.Errorf("error fetching list of clusters with delta WAL entries: %w", err) + } + + var allPaths []string + for index, cluster := range walClusters { + prefix := unifiedDeltaWALPrefix + cluster + clusterPaths, err := cb._getPresentDeltaWALForClearing(sc, prefix) + if err != nil { + return nil, fmt.Errorf("error fetching delta WAL entries for cluster (%v / %v): %w", index, cluster, err) + } + + // Here, we don't want to include the unifiedDeltaWALPrefix because + // clearUnifiedDeltaWAL handles that for us. Instead, just include + // the cluster identifier. + for _, clusterPath := range clusterPaths { + allPaths = append(allPaths, cluster+clusterPath) + } + } + + return allPaths, nil } func (cb *crlBuilder) _clearDeltaWAL(sc *storageContext, walSerials []string, path string) error { @@ -514,49 +535,70 @@ func (cb *crlBuilder) _shouldRebuildUnifiedCRLs(sc *storageContext, override boo return false, nil } + // If we're overriding whether we should build Delta CRLs, always return + // true, even if storage errors might've happen. + if override { + return true, nil + } + // Fetch two storage entries to see if we actually need to do this - // rebuild, given we're within the window. - lastWALEntry, err := sc.Storage.Get(sc.Context, unifiedDeltaWALLastRevokedSerial) - if err != nil || !override && (lastWALEntry == nil || lastWALEntry.Value == nil) { - // If this entry does not exist, we don't need to rebuild the - // delta WAL due to the expiration assumption above. There must - // not have been any new revocations. Since err should be nil - // in this case, we can safely return it. - return false, err - } - - lastBuildEntry, err := sc.Storage.Get(sc.Context, unifiedDeltaWALLastBuildSerial) + // rebuild, given we're within the window. We need to fetch these + // two entries per cluster. + clusters, err := sc.Storage.List(sc.Context, unifiedDeltaWALPrefix) if err != nil { - return false, err + return false, fmt.Errorf("failed to get the list of clusters having written Delta WALs: %w", err) } - if !override && lastBuildEntry != nil && lastBuildEntry.Value != nil { - // If the last build entry doesn't exist, we still want to build a - // new delta WAL, since this could be our very first time doing so. - // + // If any cluster tells us to rebuild, we should rebuild. + shouldRebuild := false + for index, cluster := range clusters { + prefix := unifiedDeltaWALPrefix + cluster + clusterUnifiedLastRevokedWALEntry := prefix + deltaWALLastRevokedSerialName + clusterUnifiedLastBuiltWALEntry := prefix + deltaWALLastBuildSerialName + + lastWALEntry, err := sc.Storage.Get(sc.Context, clusterUnifiedLastRevokedWALEntry) + if err != nil { + return false, fmt.Errorf("failed fetching last revoked WAL entry for cluster (%v / %v): %w", index, cluster, err) + } + + if lastWALEntry == nil || lastWALEntry.Value == nil { + continue + } + + lastBuildEntry, err := sc.Storage.Get(sc.Context, clusterUnifiedLastBuiltWALEntry) + if err != nil { + return false, fmt.Errorf("failed fetching last built CRL WAL entry for cluster (%v / %v): %w", index, cluster, err) + } + + if lastBuildEntry == nil || lastBuildEntry.Value == nil { + // If the last build entry doesn't exist, we still want to build a + // new delta WAL, since this could be our very first time doing so. + shouldRebuild = true + break + } + // Otherwise, here, now that we know it exists, we want to check this // value against the other value. Since we previously guarded the WAL // entry being non-empty, we're good to decode everything within this // guard. var walInfo lastWALInfo if err := lastWALEntry.DecodeJSON(&walInfo); err != nil { - return false, err + return false, fmt.Errorf("failed decoding last revoked WAL entry for cluster (%v / %v): %w", index, cluster, err) } var deltaInfo lastDeltaInfo if err := lastBuildEntry.DecodeJSON(&deltaInfo); err != nil { - return false, err + return false, fmt.Errorf("failed decoding last built CRL WAL entry for cluster (%v / %v): %w", index, cluster, err) } - // Here, everything decoded properly and we know that no new certs - // have been revoked since we built this last delta CRL. We can exit - // without rebuilding then. - if walInfo.Serial == deltaInfo.Serial { - return false, nil + if walInfo.Serial != deltaInfo.Serial { + shouldRebuild = true + break } } - return true, nil + // No errors occurred, so return the result. + return shouldRebuild, nil } func (cb *crlBuilder) rebuildDeltaCRLs(sc *storageContext, forceNew bool) error { @@ -982,8 +1024,20 @@ func revokeCert(sc *storageContext, config *crlConfig, cert *x509.Certificate) ( } sc.Backend.ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted, revEntry.Key) + // From here on out, the certificate has been revoked locally. Any other + // persistence issues might still err, but any other failure messages + // should be added as warnings to the revocation. + resp := &logical.Response{ + Data: map[string]interface{}{ + "revocation_time": revInfo.RevocationTime, + "revocation_time_rfc3339": revInfo.RevocationTimeUTC.Format(time.RFC3339Nano), + "state": "revoked", + }, + } + // If this flag is enabled after the fact, existing local entries will be published to // the unified storage space through a periodic function. + failedWritingUnifiedCRL := false if config.UnifiedCRL { entry := &unifiedRevocationEntry{ SerialNumber: colonSerial, @@ -999,6 +1053,9 @@ func revokeCert(sc *storageContext, config *crlConfig, cert *x509.Certificate) ( sc.Backend.Logger().Error("Failed to write unified revocation entry, will re-attempt later", "serial_number", colonSerial, "error", ignoreErr) sc.Backend.unifiedTransferStatus.forceRun() + + resp.AddWarning(fmt.Sprintf("Failed to write unified revocation entry, will re-attempt later: %v", err)) + failedWritingUnifiedCRL = true } } @@ -1017,26 +1074,20 @@ func revokeCert(sc *storageContext, config *crlConfig, cert *x509.Certificate) ( } } } else if config.EnableDelta { - if err := writeRevocationDeltaWALs(sc, config, hyphenSerial, colonSerial); err != nil { + if err := writeRevocationDeltaWALs(sc, config, resp, failedWritingUnifiedCRL, hyphenSerial, colonSerial); err != nil { return nil, fmt.Errorf("failed to write WAL entries for Delta CRLs: %w", err) } } - return &logical.Response{ - Data: map[string]interface{}{ - "revocation_time": revInfo.RevocationTime, - "revocation_time_rfc3339": revInfo.RevocationTimeUTC.Format(time.RFC3339Nano), - "state": "revoked", - }, - }, nil + return resp, nil } -func writeRevocationDeltaWALs(sc *storageContext, config *crlConfig, hyphenSerial string, colonSerial string) error { +func writeRevocationDeltaWALs(sc *storageContext, config *crlConfig, resp *logical.Response, failedWritingUnifiedCRL bool, hyphenSerial string, colonSerial string) error { if err := writeSpecificRevocationDeltaWALs(sc, hyphenSerial, colonSerial, localDeltaWALPath); err != nil { return fmt.Errorf("failed to write local delta WAL entry: %w", err) } - if config.UnifiedCRL { + if config.UnifiedCRL && !failedWritingUnifiedCRL { // We only need to write cross-cluster unified Delta WAL entries when // it is enabled; in particular, because we rebuild CRLs when enabling // this flag, any revocations that happened prior to enabling unified @@ -1046,13 +1097,21 @@ func writeRevocationDeltaWALs(sc *storageContext, config *crlConfig, hyphenSeria // listing for the unified CRL rebuild, this revocation will not // appear on either the main or the next delta CRL, but will need to // wait for a subsequent complete CRL rebuild). + // + // Lastly, we don't attempt this if the unified CRL entry failed to + // write, as we need that entry before the delta WAL entry will make + // sense. if ignoredErr := writeSpecificRevocationDeltaWALs(sc, hyphenSerial, colonSerial, unifiedDeltaWALPath); ignoredErr != nil { // Just log the error if we fail to write across clusters, a separate background // thread will reattempt it later on as we have the local write done. sc.Backend.Logger().Error("Failed to write cross-cluster delta WAL entry, will re-attempt later", "serial_number", colonSerial, "error", ignoredErr) sc.Backend.unifiedTransferStatus.forceRun() + + resp.AddWarning(fmt.Sprintf("Failed to write cross-cluster delta WAL entry, will re-attempt later: %v", ignoredErr)) } + } else if failedWritingUnifiedCRL { + resp.AddWarning("Skipping cross-cluster delta WAL entry as cross-cluster revocation failed to write; will re-attempt later.") } return nil @@ -1275,7 +1334,7 @@ func buildAnyCRLs(sc *storageContext, forceNew bool, isDelta bool) error { } func getLastWALSerial(sc *storageContext, path string) (string, error) { - lastWALEntry, err := sc.Storage.Get(sc.Context, localDeltaWALLastRevokedSerial) + lastWALEntry, err := sc.Storage.Get(sc.Context, path) if err != nil { return "", err } @@ -1438,11 +1497,23 @@ func buildAnyUnifiedCRLs( // (and potentially more) in it; when we're done writing the delta CRL, // we'll write this serial as a sentinel to see if we need to rebuild it // in the future. - var lastDeltaSerial string + // + // We need to do this per-cluster. + lastDeltaSerial := map[string]string{} if isDelta { - lastDeltaSerial, err = getLastWALSerial(sc, unifiedDeltaWALLastRevokedSerial) + clusters, err := sc.Storage.List(sc.Context, unifiedDeltaWALPrefix) if err != nil { - return nil, err + return nil, fmt.Errorf("error listing clusters for unified delta WAL building: %w", err) + } + + for index, cluster := range clusters { + path := unifiedDeltaWALPrefix + cluster + deltaWALLastRevokedSerialName + serial, err := getLastWALSerial(sc, path) + if err != nil { + return nil, fmt.Errorf("error getting last written Delta WAL serial for cluster (%v / %v): %w", index, cluster, err) + } + + lastDeltaSerial[cluster] = serial } } @@ -1513,12 +1584,20 @@ func buildAnyUnifiedCRLs( // for a while. sc.Backend.crlBuilder.lastDeltaRebuildCheck = time.Now() - if len(lastDeltaSerial) > 0 { - // When we have a last delta serial, write out the relevant info - // so we can skip extra CRL rebuilds. - deltaInfo := lastDeltaInfo{Serial: lastDeltaSerial} + // Persist all of our known last revoked serial numbers here, as the + // last seen serial during build. This will allow us to detect if any + // new revocations have occurred, forcing us to rebuild the delta CRL. + for cluster, serial := range lastDeltaSerial { + if len(serial) == 0 { + continue + } - lastDeltaBuildEntry, err := logical.StorageEntryJSON(unifiedDeltaWALLastBuildSerial, deltaInfo) + // Make sure to use the cluster-specific path. Since we're on the + // active node of the primary cluster, we own this entry and can + // safely write it. + path := unifiedDeltaWALPrefix + cluster + deltaWALLastBuildSerialName + deltaInfo := lastDeltaInfo{Serial: serial} + lastDeltaBuildEntry, err := logical.StorageEntryJSON(path, deltaInfo) if err != nil { return nil, fmt.Errorf("error creating last delta CRL rebuild serial entry: %w", err) } diff --git a/builtin/logical/pki/periodic.go b/builtin/logical/pki/periodic.go index 18032fbef..b8119e0cd 100644 --- a/builtin/logical/pki/periodic.go +++ b/builtin/logical/pki/periodic.go @@ -11,6 +11,7 @@ import ( "time" "github.com/hashicorp/vault/sdk/helper/consts" + "github.com/hashicorp/vault/sdk/logical" ) const ( @@ -90,7 +91,16 @@ func runUnifiedTransfer(sc *storageContext) { if err != nil { b.Logger().Error("an error occurred running unified transfer", "error", err.Error()) status.forceRerun.Store(true) + } else { + if config.EnableDelta { + err = doUnifiedTransferMissingDeltaWALSerials(sc, clusterId) + if err != nil { + b.Logger().Error("an error occurred running unified transfer", "error", err.Error()) + status.forceRerun.Store(true) + } + } } + status.lastRun = time.Now() } @@ -122,7 +132,7 @@ func doUnifiedTransferMissingLocalSerials(sc *storageContext, clusterId string) err := readRevocationEntryAndTransfer(sc, serialNum) if err != nil { errCount++ - sc.Backend.Logger().Debug("Failed transferring local revocation to unified space", + sc.Backend.Logger().Error("Failed transferring local revocation to unified space", "serial", serialNum, "error", err) } } @@ -135,6 +145,152 @@ func doUnifiedTransferMissingLocalSerials(sc *storageContext, clusterId string) return nil } +func doUnifiedTransferMissingDeltaWALSerials(sc *storageContext, clusterId string) error { + // We need to do a similar thing for Delta WAL entry certificates. + // When the delta WAL failed to write for one or more entries, + // we'll need to replicate these up to the primary cluster. When it + // has performed a new delta WAL build, it will empty storage and + // update to a last written WAL entry that exceeds what we've seen + // locally. + thisUnifiedWALEntryPath := unifiedDeltaWALPath + deltaWALLastRevokedSerialName + lastUnifiedWALEntry, err := getLastWALSerial(sc, thisUnifiedWALEntryPath) + if err != nil { + return fmt.Errorf("failed to fetch last cross-cluster unified revoked delta WAL serial number: %w", err) + } + + lastLocalWALEntry, err := getLastWALSerial(sc, localDeltaWALLastRevokedSerial) + if err != nil { + return fmt.Errorf("failed to fetch last locally revoked delta WAL serial number: %w", err) + } + + // We now need to transfer all the entries and then write the last WAL + // entry at the end. Start by listing all certificates; any missing + // certificates will be copied over and then the WAL entry will be + // updated once. + // + // We do not delete entries either locally or remotely, as either + // cluster could've rebuilt delta CRLs with out-of-sync information, + // removing some entries (and, we cannot differentiate between these + // two cases). On next full CRL rebuild (on either cluster), the state + // should get synchronized, and future delta CRLs after this function + // returns without issue will see the remaining entries. + // + // Lastly, we need to ensure we don't accidentally write any unified + // delta WAL entries that aren't present in the main cross-cluster + // revoked storage location. This would mean the above function failed + // to copy them for some reason, despite them presumably appearing + // locally. + _unifiedWALEntries, err := sc.Storage.List(sc.Context, unifiedDeltaWALPath) + if err != nil { + return fmt.Errorf("failed to list cross-cluster unified delta WAL storage: %w", err) + } + unifiedWALEntries := sliceToMapKey(_unifiedWALEntries) + + _unifiedRevokedSerials, err := listClusterSpecificUnifiedRevokedCerts(sc, clusterId) + if err != nil { + return fmt.Errorf("failed to list cross-cluster revoked certificates: %w", err) + } + unifiedRevokedSerials := sliceToMapKey(_unifiedRevokedSerials) + + localWALEntries, err := sc.Storage.List(sc.Context, localDeltaWALPath) + if err != nil { + return fmt.Errorf("failed to list local delta WAL storage: %w", err) + } + + if lastUnifiedWALEntry == lastLocalWALEntry && len(_unifiedWALEntries) == len(localWALEntries) { + // Writing the last revoked WAL entry is the last thing that we do. + // Because these entries match (across clusters) and we have the same + // number of entries, assume we don't have anything to sync and exit + // early. + // + // We need both checks as, in the event of PBPWF failing and then + // returning while more revocations are happening, we could have + // been schedule to run, but then skip running (if only the first + // condition was checked) because a later revocation succeeded + // in writing a unified WAL entry, before we started replicating + // the rest back up. + // + // The downside of this approach is that, if the main cluster + // does a full rebuild in the mean time, we could re-sync more + // entries back up to the primary cluster that are already + // included in the complete CRL. Users can manually rebuild the + // full CRL (clearing these duplicate delta CRL entries) if this + // affects them. + return nil + } + + errCount := 0 + for index, serial := range localWALEntries { + if index%25 == 0 { + config, _ := sc.Backend.crlBuilder.getConfigWithUpdate(sc) + if config != nil && (!config.UnifiedCRL || !config.EnableDelta) { + return errors.New("unified or delta CRLs have been disabled after we started, stopping") + } + } + + if serial == deltaWALLastBuildSerialName || serial == deltaWALLastRevokedSerialName { + // Skip our special serial numbers. + continue + } + + _, isAlreadyPresent := unifiedWALEntries[serial] + if isAlreadyPresent { + // Serial exists on both local and unified cluster. We're + // presuming we don't need to read and re-write these entries + // and that only missing entries need to be updated. + continue + } + + _, isRevokedCopied := unifiedRevokedSerials[serial] + if !isRevokedCopied { + // We need to wait here to copy over. + errCount += 1 + sc.Backend.Logger().Debug("Delta WAL exists locally, but corresponding cross-cluster full revocation entry is missing; skipping", "serial", serial) + continue + } + + // All good: read the local entry and write to the remote variant. + localPath := localDeltaWALPath + serial + unifiedPath := unifiedDeltaWALPath + serial + + entry, err := sc.Storage.Get(sc.Context, localPath) + if err != nil || entry == nil { + errCount += 1 + sc.Backend.Logger().Error("Failed reading local delta WAL entry to copy to cross-cluster", "serial", serial, "err", err) + continue + } + + entry.Key = unifiedPath + err = sc.Storage.Put(sc.Context, entry) + if err != nil { + errCount += 1 + sc.Backend.Logger().Error("Failed sync local delta WAL entry to cross-cluster unified delta WAL location", "serial", serial, "err", err) + continue + } + } + + if errCount > 0 { + // See note above about why we don't fail here. + sc.Backend.Logger().Warn(fmt.Sprintf("Failed transfering %d local delta WAL serials to unified storage", errCount)) + return nil + } + + // Everything worked. Here, we can write over the delta WAL last revoked + // value. By using the earlier value, even if new revocations have + // occurred, we ensure any further missing entries can be handled in the + // next round. + lastRevSerial := lastWALInfo{Serial: lastLocalWALEntry} + lastWALEntry, err := logical.StorageEntryJSON(thisUnifiedWALEntryPath, lastRevSerial) + if err != nil { + return fmt.Errorf("unable to create cross-cluster unified last delta CRL WAL entry: %w", err) + } + if err = sc.Storage.Put(sc.Context, lastWALEntry); err != nil { + return fmt.Errorf("error saving cross-cluster unified last delta CRL WAL entry: %w", err) + } + + return nil +} + func readRevocationEntryAndTransfer(sc *storageContext, serial string) error { hyphenSerial := normalizeSerial(serial) revInfo, err := sc.fetchRevocationInfo(hyphenSerial) diff --git a/changelog/20058.txt b/changelog/20058.txt new file mode 100644 index 000000000..e43a1f4ad --- /dev/null +++ b/changelog/20058.txt @@ -0,0 +1,3 @@ +```release-note:bug +secrets/pki: Fix building of unified delta CRLs and recovery during unified delta WAL write failures. +```