diff --git a/db/builder.cc b/db/builder.cc index 8c2c624b07..08a9fecc72 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -460,9 +460,18 @@ Status BuildTable( Status prepare = WritableFileWriter::PrepareIOOptions(tboptions.write_options, opts); if (prepare.ok()) { + // FIXME: track file for "slow" deletion, e.g. into the + // VersionSet::obsolete_files_ pipeline Status ignored = fs->DeleteFile(fname, opts, dbg); ignored.PermitUncheckedError(); } + // Ensure we don't leak table cache entries when throwing away output + // files. (The usual logic in PurgeObsoleteFiles is not applicable because + // this function deletes the obsolete file itself, while they should + // probably go into the VersionSet::obsolete_files_ pipeline.) + TableCache::ReleaseObsolete(table_cache->get_cache().get(), + meta->fd.GetNumber(), nullptr /*handle*/, + mutable_cf_options.uncache_aggressiveness); } assert(blob_file_additions || blob_file_paths.empty()); diff --git a/db/compaction/subcompaction_state.cc b/db/compaction/subcompaction_state.cc index aae446351f..2accc0a94d 100644 --- a/db/compaction/subcompaction_state.cc +++ b/db/compaction/subcompaction_state.cc @@ -34,9 +34,16 @@ void SubcompactionState::Cleanup(Cache* cache) { if (!status.ok()) { for (const auto& out : GetOutputs()) { - // If this file was inserted into the table cache then remove - // them here because this compaction was not committed. - TableCache::Evict(cache, out.meta.fd.GetNumber()); + // If this file was inserted into the table cache then remove it here + // because this compaction was not committed. This is not strictly + // required because of a backstop TableCache::Evict() in + // PurgeObsoleteFiles() but is our opportunity to apply + // uncache_aggressiveness. TODO: instead, put these files into the + // VersionSet::obsolete_files_ pipeline so that they don't have to + // be picked up by scanning the DB directory. + TableCache::ReleaseObsolete( + cache, out.meta.fd.GetNumber(), nullptr /*handle*/, + compaction->mutable_cf_options()->uncache_aggressiveness); } } // TODO: sub_compact.io_status is not checked like status. Not sure if thats diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc index df264dcd33..912f910389 100644 --- a/db/db_impl/db_impl_debug.cc +++ b/db/db_impl/db_impl_debug.cc @@ -338,31 +338,48 @@ void DBImpl::TEST_VerifyNoObsoleteFilesCached( l.emplace(&mutex_); } - std::vector live_files; + if (!opened_successfully_) { + // We don't need to pro-actively clean up open files during DB::Open() + // if we know we are about to fail and clean up in Close(). + return; + } + if (disable_delete_obsolete_files_ > 0) { + // For better or worse, DB::Close() is allowed with deletions disabled. + // Since we generally associate clean-up of open files with deleting them, + // we allow "obsolete" open files when deletions are disabled. + return; + } + + // Live and "quarantined" files are allowed to be open in table cache + std::set live_and_quar_files; for (auto cfd : *versions_->GetColumnFamilySet()) { if (cfd->IsDropped()) { continue; } - // Sneakily add both SST and blob files to the same list - cfd->current()->AddLiveFiles(&live_files, &live_files); - } - std::sort(live_files.begin(), live_files.end()); + // Iterate over live versions + Version* current = cfd->current(); + Version* ver = current; + do { + // Sneakily add both SST and blob files to the same list + std::vector live_files_vec; + ver->AddLiveFiles(&live_files_vec, &live_files_vec); + live_and_quar_files.insert(live_files_vec.begin(), live_files_vec.end()); - auto fn = [&live_files](const Slice& key, Cache::ObjectPtr, size_t, - const Cache::CacheItemHelper* helper) { - if (helper != BlobFileCache::GetHelper()) { - // Skip non-blob files for now - // FIXME: diagnose and fix the leaks of obsolete SST files revealed in - // unit tests. - return; - } + ver = ver->Next(); + } while (ver != current); + } + { + const auto& quar_files = error_handler_.GetFilesToQuarantine(); + live_and_quar_files.insert(quar_files.begin(), quar_files.end()); + } + auto fn = [&live_and_quar_files](const Slice& key, Cache::ObjectPtr, size_t, + const Cache::CacheItemHelper*) { // See TableCache and BlobFileCache assert(key.size() == sizeof(uint64_t)); uint64_t file_number; GetUnaligned(reinterpret_cast(key.data()), &file_number); - // Assert file is in sorted live_files - assert( - std::binary_search(live_files.begin(), live_files.end(), file_number)); + // Assert file is in live/quarantined set + assert(live_and_quar_files.find(file_number) != live_and_quar_files.end()); }; table_cache_->ApplyToAllEntries(fn, {}); } diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index 96178b7065..bba8c064ca 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -449,14 +449,8 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { // File is being deleted (actually obsolete) auto number = file.metadata->fd.GetNumber(); candidate_files.emplace_back(MakeTableFileName(number), file.path); - if (handle == nullptr) { - // For files not "pinned" in table cache - handle = TableCache::Lookup(table_cache_.get(), number); - } - if (handle) { - TableCache::ReleaseObsolete(table_cache_.get(), handle, - file.uncache_aggressiveness); - } + TableCache::ReleaseObsolete(table_cache_.get(), number, handle, + file.uncache_aggressiveness); } file.DeleteMetadata(); } @@ -572,9 +566,17 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { case kTableFile: // If the second condition is not there, this makes // DontDeletePendingOutputs fail + // FIXME: but should NOT keep if it came from sst_delete_files? keep = (sst_live_set.find(number) != sst_live_set.end()) || number >= state.min_pending_output; if (!keep) { + // NOTE: sometimes redundant (if came from sst_delete_files) + // We don't know which column family is applicable here so we don't + // know what uncache_aggressiveness would be used with + // ReleaseObsolete(). Anyway, obsolete files ideally go into + // sst_delete_files for better/quicker handling, and this is just a + // backstop. + TableCache::Evict(table_cache_.get(), number); files_to_del.insert(number); } break; diff --git a/db/table_cache.cc b/db/table_cache.cc index 28b25b880c..e3aac4e1ac 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -726,13 +726,19 @@ uint64_t TableCache::ApproximateSize( return result; } -void TableCache::ReleaseObsolete(Cache* cache, Cache::Handle* h, +void TableCache::ReleaseObsolete(Cache* cache, uint64_t file_number, + Cache::Handle* h, uint32_t uncache_aggressiveness) { CacheInterface typed_cache(cache); TypedHandle* table_handle = reinterpret_cast(h); - TableReader* table_reader = typed_cache.Value(table_handle); - table_reader->MarkObsolete(uncache_aggressiveness); - typed_cache.ReleaseAndEraseIfLastRef(table_handle); + if (table_handle == nullptr) { + table_handle = typed_cache.Lookup(GetSliceForFileNumber(&file_number)); + } + if (table_handle != nullptr) { + TableReader* table_reader = typed_cache.Value(table_handle); + table_reader->MarkObsolete(uncache_aggressiveness); + typed_cache.ReleaseAndEraseIfLastRef(table_handle); + } } } // namespace ROCKSDB_NAMESPACE diff --git a/db/table_cache.h b/db/table_cache.h index 5fd0123bc7..701f96f22a 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -159,12 +159,14 @@ class TableCache { bool skip_range_deletions = false, int level = -1, TypedHandle* table_handle = nullptr); - // Evict any entry for the specified file number + // Evict any entry for the specified file number. ReleaseObsolete() is + // preferred for cleaning up from obsolete files. static void Evict(Cache* cache, uint64_t file_number); // Handles releasing, erasing, etc. of what should be the last reference - // to an obsolete file. - static void ReleaseObsolete(Cache* cache, Cache::Handle* handle, + // to an obsolete file. `handle` may be nullptr if no prior handle is known. + static void ReleaseObsolete(Cache* cache, uint64_t file_number, + Cache::Handle* handle, uint32_t uncache_aggressiveness); // Return handle to an existing cache entry if there is one diff --git a/db/version_set.cc b/db/version_set.cc index d06eb71202..457fddd6a8 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -5204,18 +5204,16 @@ VersionSet::~VersionSet() { column_family_set_.reset(); for (auto& file : obsolete_files_) { - if (file.metadata->table_reader_handle) { - // NOTE: DB is shutting down, so file is probably not obsolete, just - // no longer referenced by Versions in memory. - // For more context, see comment on "table_cache_->EraseUnRefEntries()" - // in DBImpl::CloseHelper(). - // Using uncache_aggressiveness=0 overrides any previous marking to - // attempt to uncache the file's blocks (which after cleaning up - // column families could cause use-after-free) - TableCache::ReleaseObsolete(table_cache_, - file.metadata->table_reader_handle, - /*uncache_aggressiveness=*/0); - } + // NOTE: DB is shutting down, so file is probably not obsolete, just + // no longer referenced by Versions in memory. + // For more context, see comment on "table_cache_->EraseUnRefEntries()" + // in DBImpl::CloseHelper(). + // Using uncache_aggressiveness=0 overrides any previous marking to + // attempt to uncache the file's blocks (which after cleaning up + // column families could cause use-after-free) + TableCache::ReleaseObsolete(table_cache_, file.metadata->fd.GetNumber(), + file.metadata->table_reader_handle, + /*uncache_aggressiveness=*/0); file.DeleteMetadata(); } obsolete_files_.clear(); diff --git a/unreleased_history/bug_fixes/open_sst_file_leaks.md b/unreleased_history/bug_fixes/open_sst_file_leaks.md new file mode 100644 index 0000000000..a75e3be541 --- /dev/null +++ b/unreleased_history/bug_fixes/open_sst_file_leaks.md @@ -0,0 +1 @@ +* Fix leaks of some open SST files (until `DB::Close()`) that are written but never become live due to various failures. (We now have a check for such leaks with no outstanding issues.)