diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc
index 0a8541e039..9fdc4f4668 100644
--- a/db/db_block_cache_test.cc
+++ b/db/db_block_cache_test.cc
@@ -1339,6 +1339,14 @@ TEST_P(DBBlockCacheTypeTest, AddRedundantStats) {
   EXPECT_EQ(3, TestGetTickerCount(options, BLOCK_CACHE_ADD_REDUNDANT));
 }
 
+namespace {
+std::string AltKey(int i) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "altkey%06d", i);
+  return std::string(buf);
+}
+}  // namespace
+
 TEST_P(DBBlockCacheTypeTest, Uncache) {
   for (bool partitioned : {false, true}) {
     SCOPED_TRACE("partitioned=" + std::to_string(partitioned));
@@ -1349,10 +1357,11 @@ TEST_P(DBBlockCacheTypeTest, Uncache) {
       Options options = CurrentOptions();
      options.uncache_aggressiveness = ua;
       options.create_if_missing = true;
-      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
       // Don't allow background operations to keep Versions referenced
       options.stats_dump_period_sec = 0;
       options.stats_persist_period_sec = 0;
+      auto stats = ROCKSDB_NAMESPACE::CreateDBStatistics();
+      options.statistics = stats;
 
       const size_t capacity = size_t{1} << 25;
       const int num_shard_bits = 0;  // 1 shard
@@ -1401,6 +1410,8 @@ TEST_P(DBBlockCacheTypeTest, Uncache) {
       // Combine into one file, making the originals obsolete
       ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
 
+      ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
       for (int i = 0; i < kNumDataBlocks; i++) {
         ASSERT_NE(Get(Key(i)), "NOT_FOUND");
       }
@@ -1420,6 +1431,75 @@ TEST_P(DBBlockCacheTypeTest, Uncache) {
         EXPECT_LT(cache->GetUsage(),
                   kNumDataBlocks * table_options.block_size * 2U);
       }
+
+      size_t alt_baseline_count = cache->GetOccupancyCount();
+      size_t alt_baseline_usage = cache->GetUsage();
+      ASSERT_OK(stats->Reset());
+      // We aren't generally cleaning up cache entries on DB::Close, especially
+      // because someone might just re-open the same DB.
+      Reopen(options);
+      for (int i = 0; i < kNumDataBlocks; i++) {
+        ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+      }
+
+      EXPECT_EQ(cache->GetOccupancyCount(), alt_baseline_count);
+      EXPECT_EQ(cache->GetUsage(), alt_baseline_usage);
+
+      // Check for unnecessary cache churn
+      ASSERT_EQ(stats->getTickerCount(BLOCK_CACHE_ADD), 0U);
+      ASSERT_EQ(stats->getTickerCount(BLOCK_CACHE_MISS), 0U);
+      ASSERT_GT(stats->getTickerCount(BLOCK_CACHE_HIT), 0U);
+
+      // And now do a similar test as above, except with trivial moves, making
+      // sure that we aren't falsely uncaching in that case, which would cause
+      // unnecessary cache misses. Using AltKey instead of Key to avoid
+      // interference.
+      for (int i = 0; i < kNumDataBlocks; i++) {
+        // No overlap
+        ASSERT_OK(
+            Put(AltKey(i), Random::GetTLSInstance()->RandomBinaryString(
+                               static_cast<int>(table_options.block_size))));
+        if (i >= kNumDataBlocks - kNumFiles) {
+          ASSERT_OK(Flush());
+        }
+      }
+      ASSERT_EQ(int{kNumFiles}, NumTableFilesAtLevel(0));
+
+      for (int i = 0; i < kNumDataBlocks; i++) {
+        ASSERT_NE(Get(AltKey(i)), "NOT_FOUND");
+      }
+
+      ASSERT_EQ(cache->GetOccupancyCount(),
+                alt_baseline_count + kNumDataBlocks +
+                    meta_blocks_per_file * kNumFiles);
+      ASSERT_GE(cache->GetUsage(),
+                alt_baseline_usage + kNumDataBlocks * table_options.block_size);
+
+      ASSERT_OK(stats->Reset());
+
+      // Make trivial move
+      {
+        auto a = AltKey(0);
+        auto b = AltKey(kNumDataBlocks);
+        Slice slice_a{a};
+        Slice slice_b{b};
+        ASSERT_OK(db_->CompactRange({}, &slice_a, &slice_b));
+      }
+      ASSERT_EQ(/*old*/ 1 + /*new*/ int{kNumFiles}, NumTableFilesAtLevel(1));
+
+      for (int i = 0; i < kNumDataBlocks; i++) {
+        ASSERT_NE(Get(AltKey(i)), "NOT_FOUND");
+      }
+
+      // Should be the same if trivial move
+      ASSERT_EQ(cache->GetOccupancyCount(),
+                alt_baseline_count + kNumDataBlocks +
+                    meta_blocks_per_file * kNumFiles);
+
+      // Check for unnecessary cache churn
+      ASSERT_EQ(stats->getTickerCount(BLOCK_CACHE_ADD), 0U);
+      ASSERT_EQ(stats->getTickerCount(BLOCK_CACHE_MISS), 0U);
+      ASSERT_GT(stats->getTickerCount(BLOCK_CACHE_HIT), 0U);
     }
   }
 }
diff --git a/db/db_test.cc b/db/db_test.cc
index 879636176c..fbcff5b48b 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -6353,6 +6353,8 @@ TEST_F(DBTest, PromoteL0) {
   Options options = CurrentOptions();
   options.disable_auto_compactions = true;
   options.write_buffer_size = 10 * 1024 * 1024;
+  // Exercise what was a use-after-free (ASAN failure) under ~VersionSet()
+  options.uncache_aggressiveness = 300;
   DestroyAndReopen(options);
 
   // non overlapping ranges
diff --git a/db/version_set.cc b/db/version_set.cc
index a1bf7033ef..454fca0503 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -5196,17 +5196,21 @@ Status VersionSet::Close(FSDirectory* db_dir, InstrumentedMutex* mu) {
 }
 
 VersionSet::~VersionSet() {
-  // we need to delete column_family_set_ because its destructor depends on
-  // VersionSet
+  // Must clean up column families to make all files "obsolete"
   column_family_set_.reset();
+
   for (auto& file : obsolete_files_) {
     if (file.metadata->table_reader_handle) {
       // NOTE: DB is shutting down, so file is probably not obsolete, just
       // no longer referenced by Versions in memory.
       // For more context, see comment on "table_cache_->EraseUnRefEntries()"
       // in DBImpl::CloseHelper().
-      table_cache_->Release(file.metadata->table_reader_handle);
-      TableCache::Evict(table_cache_, file.metadata->fd.GetNumber());
+      // Using uncache_aggressiveness=0 overrides any previous marking to
+      // attempt to uncache the file's blocks (which after cleaning up
+      // column families could cause use-after-free)
+      TableCache::ReleaseObsolete(table_cache_,
+                                  file.metadata->table_reader_handle,
+                                  /*uncache_aggressiveness=*/0);
     }
     file.DeleteMetadata();
   }
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 42104d7aef..af46b4dffb 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -136,10 +136,10 @@ extern const std::string kHashIndexPrefixesBlock;
 extern const std::string kHashIndexPrefixesMetadataBlock;
 
 BlockBasedTable::~BlockBasedTable() {
-  if (rep_->uncache_aggressiveness > 0 && rep_->table_options.block_cache) {
+  auto ua = rep_->uncache_aggressiveness.LoadRelaxed();
+  if (ua > 0 && rep_->table_options.block_cache) {
     if (rep_->filter) {
-      rep_->filter->EraseFromCacheBeforeDestruction(
-          rep_->uncache_aggressiveness);
+      rep_->filter->EraseFromCacheBeforeDestruction(ua);
     }
     if (rep_->index_reader) {
       {
@@ -160,7 +160,7 @@ BlockBasedTable::~BlockBasedTable() {
         // without I/O. (NOTE: It's extremely unlikely that a data block
         // will be in block cache without the index block pointing to it
         // also in block cache.)
-        UncacheAggressivenessAdvisor advisor(rep_->uncache_aggressiveness);
+        UncacheAggressivenessAdvisor advisor(ua);
         for (iiter->SeekToFirst(); iiter->Valid() && advisor.ShouldContinue();
              iiter->Next()) {
           bool erased = EraseFromCache(iiter->value().handle);
@@ -170,8 +170,7 @@ BlockBasedTable::~BlockBasedTable() {
       }
 
       // Un-cache the index block(s)
-      rep_->index_reader->EraseFromCacheBeforeDestruction(
-          rep_->uncache_aggressiveness);
+      rep_->index_reader->EraseFromCacheBeforeDestruction(ua);
     }
   }
   delete rep_;
@@ -3296,7 +3295,7 @@ void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value,
 }
 
 void BlockBasedTable::MarkObsolete(uint32_t uncache_aggressiveness) {
-  rep_->uncache_aggressiveness = uncache_aggressiveness;
+  rep_->uncache_aggressiveness.StoreRelaxed(uncache_aggressiveness);
 }
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index b4d0f17119..d940386d12 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -33,6 +33,7 @@
 #include "table/table_reader.h"
 #include "table/two_level_iterator.h"
 #include "trace_replay/block_cache_tracer.h"
+#include "util/atomic.h"
 #include "util/coro_utils.h"
 #include "util/hash_containers.h"
 
@@ -675,8 +676,11 @@ struct BlockBasedTable::Rep {
   const bool user_defined_timestamps_persisted;
 
   // Set to >0 when the file is known to be obsolete and should have its block
-  // cache entries evicted on close.
-  uint32_t uncache_aggressiveness = 0;
+  // cache entries evicted on close. NOTE: when the file becomes obsolete,
+  // there could be multiple table cache references that all mark this file as
+  // obsolete. An atomic resolves the race quite reasonably. Even in the rare
+  // case of such a race, they will most likely be storing the same value.
+  RelaxedAtomic<uint32_t> uncache_aggressiveness{0};
 
   std::unique_ptr<CacheReservationManager::CacheReservationHandle>
       table_reader_cache_res_handle = nullptr;
diff --git a/table/table_reader.h b/table/table_reader.h
index d6b72480d8..9faf8c1c3e 100644
--- a/table/table_reader.h
+++ b/table/table_reader.h
@@ -191,7 +191,9 @@ class TableReader {
 
   // Tell the reader that the file should now be obsolete, e.g. as a hint
   // to delete relevant cache entries on destruction. (It might not be safe
-  // to "unpin" cache entries until destruction time.)
+  // to "unpin" cache entries until destruction time.) NOTE: must be thread
+  // safe because multiple table cache references might all mark this file as
+  // obsolete when they are released (the last of which destroys this reader).
   virtual void MarkObsolete(uint32_t /*uncache_aggressiveness*/) {
     // no-op as default
   }
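Side note (illustrative, not part of the patch): a minimal sketch of the kind of RelaxedAtomic<T> wrapper assumed to come from util/atomic.h, showing why relaxed loads/stores are enough for uncache_aggressiveness: racing markers would almost always store the same value, and only the final destructor reads it. The class name and usage snippet below are hypothetical.

#include <atomic>
#include <cstdint>

// Hypothetical stand-in for the RelaxedAtomic<T> used above: every access is
// std::memory_order_relaxed, which is adequate when concurrent writers would
// all store the same (or an equally acceptable) value and no other memory is
// published through this variable.
template <typename T>
class RelaxedAtomicSketch {
 public:
  explicit RelaxedAtomicSketch(T initial = T{}) : v_(initial) {}
  void StoreRelaxed(T desired) {
    v_.store(desired, std::memory_order_relaxed);
  }
  T LoadRelaxed() const { return v_.load(std::memory_order_relaxed); }

 private:
  std::atomic<T> v_;
};

// Usage mirroring the patch: any releasing table cache reference may mark the
// file obsolete; only the last reference (running the destructor) reads it.
//   RelaxedAtomicSketch<uint32_t> uncache_aggressiveness{0};
//   uncache_aggressiveness.StoreRelaxed(300);              // MarkObsolete()
//   uint32_t ua = uncache_aggressiveness.LoadRelaxed();    // ~BlockBasedTable()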