From 54ace7f34083090dcefa51c0fd9381436ccb8fa0 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Thu, 19 Sep 2024 15:47:13 -0700 Subject: [PATCH] Change the semantics of blob_garbage_collection_force_threshold to provide better control over space amp (#13022) Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/13022 Currently, `blob_garbage_collection_force_threshold` applies to the oldest batch of blob files, which is typically only a small subset of the blob files currently eligible for garbage collection. This can result in a form of head-of-line blocking: no GC-triggered compactions will be scheduled if the oldest batch does not currently exceed the threshold, even if a lot of higher-numbered blob files do. This can in turn lead to high space amplification that exceeds the soft bound implicit in the force threshold (e.g. 50% would suggest a space amp of <2 and 75% would imply a space amp of <4). The patch changes the semantics of this configuration threshold to apply to the entire set of blob files that are eligible for garbage collection based on `blob_garbage_collection_age_cutoff`. This provides more intuitive semantics for the option and can provide a better write amp/space amp trade-off. (Note that GC-triggered compactions still pick the same SST files as before, so triggered GC still targets the oldest blob files.) 
Reviewed By: jowlyzhang Differential Revision: D62977860 fbshipit-source-id: a999f31fe9cdda313de513f0e7a6fc707424d4a3 --- db/version_set.cc | 39 +++------- db/version_set_test.cc | 78 +------------------ db_stress_tool/db_stress_gflags.cc | 2 +- include/rocksdb/advanced_options.h | 13 ++-- ...edMutableColumnFamilyOptionsInterface.java | 9 +-- .../java/org/rocksdb/ColumnFamilyOptions.java | 9 +-- tools/db_bench_tool.cc | 2 +- ...blob_garbage_collection_force_threshold.md | 1 + 8 files changed, 32 insertions(+), 121 deletions(-) create mode 100644 unreleased_history/behavior_changes/blob_garbage_collection_force_threshold.md diff --git a/db/version_set.cc b/db/version_set.cc index c42a50cbaa..d28b2e2d96 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -3772,14 +3772,17 @@ void VersionStorageInfo::ComputeFilesMarkedForForcedBlobGC( return; } - // Compute the sum of total and garbage bytes over the oldest batch of blob - // files. The oldest batch is defined as the set of blob files which are - // kept alive by the same SSTs as the very oldest one. Here is a toy example. - // Let's assume we have three SSTs 1, 2, and 3, and four blob files 10, 11, - // 12, and 13. Also, let's say SSTs 1 and 2 both rely on blob file 10 and - // potentially some higher-numbered ones, while SST 3 relies on blob file 12 - // and potentially some higher-numbered ones. Then, the SST to oldest blob - // file mapping is as follows: + // Compute the sum of total and garbage bytes over the batch of blob files + // currently eligible for garbage collection based on + // blob_garbage_collection_age_cutoff, and if the garbage ratio exceeds + // blob_garbage_collection_force_threshold, schedule compaction for the + // SST files that reference the oldest batch of blob files. Here is a toy + // example. Let's assume we have three SSTs 1, 2, and 3, and four blob files + // 10, 11, 12, and 13, which correspond to the range that is eligible for GC + // and satisfy the garbage ratio threshold. 
Also, let's say SSTs 1 and 2 both + // rely on blob file 10 and potentially some higher-numbered ones, while SST 3 + // relies on blob file 12 and potentially some higher-numbered ones. Then, the + // SST to oldest blob file mapping is as follows: // // SST file number Oldest blob file number // 1 10 @@ -3797,11 +3800,6 @@ void VersionStorageInfo::ComputeFilesMarkedForForcedBlobGC( // // Then, the oldest batch of blob files consists of blob files 10 and 11, // and we can get rid of them by forcing the compaction of SSTs 1 and 2. - // - // Note that the overall ratio of garbage computed for the batch has to exceed - // blob_garbage_collection_force_threshold and the entire batch has to be - // eligible for GC according to blob_garbage_collection_age_cutoff in order - // for us to schedule any compactions. const auto& oldest_meta = blob_files_.front(); assert(oldest_meta); @@ -3818,25 +3816,10 @@ void VersionStorageInfo::ComputeFilesMarkedForForcedBlobGC( const auto& meta = blob_files_[count]; assert(meta); - if (!meta->GetLinkedSsts().empty()) { - // Found the beginning of the next batch of blob files - break; - } - sum_total_blob_bytes += meta->GetTotalBlobBytes(); sum_garbage_blob_bytes += meta->GetGarbageBlobBytes(); } - if (count < blob_files_.size()) { - const auto& meta = blob_files_[count]; - assert(meta); - - if (meta->GetLinkedSsts().empty()) { - // Some files in the oldest batch are not eligible for GC - return; - } - } - if (sum_garbage_blob_bytes < blob_garbage_collection_force_threshold * sum_total_blob_bytes) { return; diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 4bc9806e22..a483ccf0e8 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -727,20 +727,7 @@ TEST_F(VersionStorageInfoTest, ForcedBlobGCSingleBatch) { ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); } - // Part of the oldest batch of blob files (specifically, #12 and #13) is - // ineligible for GC due to the age cutoff - - { - constexpr double 
age_cutoff = 0.5; - constexpr double force_threshold = 0.0; - vstorage_.ComputeFilesMarkedForForcedBlobGC( - age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true); - - ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); - } - - // Oldest batch is eligible based on age cutoff but its overall garbage ratio - // is below threshold + // Overall garbage ratio of eligible files is below threshold { constexpr double age_cutoff = 1.0; @@ -751,8 +738,7 @@ TEST_F(VersionStorageInfoTest, ForcedBlobGCSingleBatch) { ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); } - // Oldest batch is eligible based on age cutoff and its overall garbage ratio - // meets threshold + // Overall garbage ratio of eligible files meets threshold { constexpr double age_cutoff = 1.0; @@ -878,20 +864,7 @@ TEST_F(VersionStorageInfoTest, ForcedBlobGCMultipleBatches) { ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); } - // Part of the oldest batch of blob files (specifically, the second file) is - // ineligible for GC due to the age cutoff - - { - constexpr double age_cutoff = 0.25; - constexpr double force_threshold = 0.0; - vstorage_.ComputeFilesMarkedForForcedBlobGC( - age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true); - - ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); - } - - // Oldest batch is eligible based on age cutoff but its overall garbage ratio - // is below threshold + // Overall garbage ratio of eligible files is below threshold { constexpr double age_cutoff = 0.5; @@ -902,8 +875,7 @@ TEST_F(VersionStorageInfoTest, ForcedBlobGCMultipleBatches) { ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); } - // Oldest batch is eligible based on age cutoff and its overall garbage ratio - // meets threshold + // Overall garbage ratio of eligible files meets threshold { constexpr double age_cutoff = 0.5; @@ -929,48 +901,6 @@ TEST_F(VersionStorageInfoTest, ForcedBlobGCMultipleBatches) { 
ASSERT_EQ(ssts_to_be_compacted[0], expected_ssts_to_be_compacted[0]); ASSERT_EQ(ssts_to_be_compacted[1], expected_ssts_to_be_compacted[1]); } - - // Now try the last two cases again with a greater than necessary age cutoff - - // Oldest batch is eligible based on age cutoff but its overall garbage ratio - // is below threshold - - { - constexpr double age_cutoff = 0.75; - constexpr double force_threshold = 0.6; - vstorage_.ComputeFilesMarkedForForcedBlobGC( - age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true); - - ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); - } - - // Oldest batch is eligible based on age cutoff and its overall garbage ratio - // meets threshold - - { - constexpr double age_cutoff = 0.75; - constexpr double force_threshold = 0.5; - vstorage_.ComputeFilesMarkedForForcedBlobGC( - age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true); - - auto ssts_to_be_compacted = vstorage_.FilesMarkedForForcedBlobGC(); - ASSERT_EQ(ssts_to_be_compacted.size(), 2); - - std::sort(ssts_to_be_compacted.begin(), ssts_to_be_compacted.end(), - [](const std::pair& lhs, - const std::pair& rhs) { - assert(lhs.second); - assert(rhs.second); - return lhs.second->fd.GetNumber() < rhs.second->fd.GetNumber(); - }); - - const autovector> - expected_ssts_to_be_compacted{{level, level_files[0]}, - {level, level_files[1]}}; - - ASSERT_EQ(ssts_to_be_compacted[0], expected_ssts_to_be_compacted[0]); - ASSERT_EQ(ssts_to_be_compacted[1], expected_ssts_to_be_compacted[1]); - } } class VersionStorageInfoTimestampTest : public VersionStorageInfoTestBase { diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index c23a163a81..9f165cf977 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -497,7 +497,7 @@ DEFINE_double(blob_garbage_collection_force_threshold, ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() .blob_garbage_collection_force_threshold, "[Integrated BlobDB] The 
threshold for the ratio of garbage in " - "the oldest blob files for forcing garbage collection."); + "the eligible blob files for forcing garbage collection."); DEFINE_uint64(blob_compaction_readahead_size, ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index 309d0c510a..0805e2aabe 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -929,13 +929,12 @@ struct AdvancedColumnFamilyOptions { // Dynamically changeable through the SetOptions() API double blob_garbage_collection_age_cutoff = 0.25; - // If the ratio of garbage in the oldest blob files exceeds this threshold, - // targeted compactions are scheduled in order to force garbage collecting - // the blob files in question, assuming they are all eligible based on the - // value of blob_garbage_collection_age_cutoff above. This option is - // currently only supported with leveled compactions. - // Note that enable_blob_garbage_collection has to be set in order for this - // option to have any effect. + // If the ratio of garbage in the blob files currently eligible for garbage + // collection exceeds this threshold, targeted compactions are scheduled in + // order to force garbage collecting the oldest blob files. This option is + // currently only supported with leveled compactions. Note that + // enable_blob_garbage_collection has to be set in order for this option to + // have any effect. 
// // Default: 1.0 // diff --git a/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java index c8fc841737..44e61c6d74 100644 --- a/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java @@ -733,11 +733,10 @@ public interface AdvancedMutableColumnFamilyOptionsInterface< double blobGarbageCollectionAgeCutoff(); /** - * If the ratio of garbage in the oldest blob files exceeds this threshold, - * targeted compactions are scheduled in order to force garbage collecting - * the blob files in question, assuming they are all eligible based on the - * value of {@link #blobGarbageCollectionAgeCutoff} above. This option is - * currently only supported with leveled compactions. + * If the ratio of garbage in the blob files currently eligible for garbage + * collection exceeds this threshold, targeted compactions are scheduled in + * order to force garbage collecting the oldest blob files. This option is + * currently only supported with leveled compactions. *

* Note that {@link #enableBlobGarbageCollection} has to be set in order for this * option to have any effect. diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java b/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java index bb458078cd..3af4d2a8ed 100644 --- a/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java +++ b/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java @@ -1204,11 +1204,10 @@ public class ColumnFamilyOptions } /** - * If the ratio of garbage in the oldest blob files exceeds this threshold, - * targeted compactions are scheduled in order to force garbage collecting - * the blob files in question, assuming they are all eligible based on the - * value of {@link #blobGarbageCollectionAgeCutoff} above. This option is - * currently only supported with leveled compactions. + * If the ratio of garbage in the blob files currently eligible for garbage + * collection exceeds this threshold, targeted compactions are scheduled in + * order to force garbage collecting the oldest blob files. This option is + * currently only supported with leveled compactions. *

* Note that {@link #enableBlobGarbageCollection} has to be set in order for this * option to have any effect. diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index d51dbf30ad..ebb5583514 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -1104,7 +1104,7 @@ DEFINE_double(blob_garbage_collection_force_threshold, ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() .blob_garbage_collection_force_threshold, "[Integrated BlobDB] The threshold for the ratio of garbage in " - "the oldest blob files for forcing garbage collection."); + "the eligible blob files for forcing garbage collection."); DEFINE_uint64(blob_compaction_readahead_size, ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() diff --git a/unreleased_history/behavior_changes/blob_garbage_collection_force_threshold.md b/unreleased_history/behavior_changes/blob_garbage_collection_force_threshold.md new file mode 100644 index 0000000000..0c4b8bba25 --- /dev/null +++ b/unreleased_history/behavior_changes/blob_garbage_collection_force_threshold.md @@ -0,0 +1 @@ +Changed the semantics of the BlobDB configuration option `blob_garbage_collection_force_threshold` to define a threshold for the overall garbage ratio of all blob files currently eligible for garbage collection (according to `blob_garbage_collection_age_cutoff`). This can provide better control over space amplification at the cost of slightly higher write amplification.