mirror of
https://github.com/facebook/rocksdb.git
synced 2024-12-01 16:15:59 +00:00
e484b81eee
Summary: **Context:** Below crash test revealed a bug that directory containing CURRENT file (short for `dir_contains_current_file` below) was not always get synced after a new CURRENT is created and being called with `RenameFile` as part of the creation. This bug exposes a risk that such un-synced directory containing the updated CURRENT can’t survive a host crash (e.g, power loss) hence get corrupted. This then will be followed by a recovery from a corrupted CURRENT that we don't want. The root-cause is that a nullptr `FSDirectory* dir_contains_current_file` sometimes gets passed-down to `SetCurrentFile()` hence in those case `dir_contains_current_file->FSDirectory::FsyncWithDirOptions()` will be skipped (which otherwise will internally call`Env/FS::SyncDic()` ) ``` ./db_stress --acquire_snapshot_one_in=10000 --adaptive_readahead=1 --allow_data_in_errors=True --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=8 --block_size=16384 --bloom_bits=134.8015470676662 --bottommost_compression_type=disable --cache_size=8388608 --checkpoint_one_in=1000000 --checksum_type=kCRC32c --clear_column_family_one_in=0 --compact_files_one_in=1000000 --compact_range_one_in=1000000 --compaction_pri=2 --compaction_ttl=100 --compression_max_dict_buffer_bytes=511 --compression_max_dict_bytes=16384 --compression_type=zstd --compression_use_zstd_dict_trainer=1 --compression_zstd_max_train_bytes=65536 --continuous_verification_interval=0 --data_block_index_type=0 --db=$db --db_write_buffer_size=1048576 --delpercent=5 --delrangepercent=0 --destroy_db_initially=0 --disable_wal=0 --enable_compaction_filter=0 --enable_pipelined_write=1 --expected_values_dir=$exp --fail_if_options_file_error=1 --file_checksum_impl=none --flush_one_in=1000000 --get_current_wal_file_one_in=0 --get_live_files_one_in=1000000 --get_property_one_in=1000000 --get_sorted_wal_files_one_in=0 --index_block_restart_interval=4 --ingest_external_file_one_in=0 --iterpercent=10 --key_len_percent_dist=1,30,69 --level_compaction_dynamic_level_bytes=True --mark_for_compaction_one_file_in=10 --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=10000 --max_key_len=3 --max_manifest_file_size=16384 --max_write_batch_group_size_bytes=64 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=0 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=1 --memtable_whole_key_filtering=1 --mmap_read=1 --nooverwritepercent=1 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=100000000 --optimize_filters_for_memory=1 --paranoid_file_checks=1 --partition_pinning=2 --pause_background_one_in=1000000 --periodic_compaction_seconds=0 --prefix_size=5 --prefixpercent=5 --prepopulate_block_cache=1 --progress_reports=0 --read_fault_one_in=1000 --readpercent=45 --recycle_log_file_num=0 --reopen=0 --ribbon_starting_level=999 --secondary_cache_fault_one_in=32 --secondary_cache_uri=compressed_secondary_cache://capacity=8388608 --set_options_one_in=10000 --snapshot_hold_ops=100000 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --subcompactions=3 --sync_fault_injection=1 --target_file_size_base=2097 --target_file_size_multiplier=2 --test_batches_snapshots=1 --top_level_index_pinning=1 --use_full_merge_v1=1 --use_merge=1 --value_size_mult=32 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_db_one_in=100000 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=524288 --write_buffer_size=4194 --writepercent=35 ``` ``` stderr: WARNING: prefix_size is non-zero but memtablerep != prefix_hash db_stress: utilities/fault_injection_fs.cc:748: virtual rocksdb::IOStatus rocksdb::FaultInjectionTestFS::RenameFile(const std::string &, const std::string &, const rocksdb::IOOptions &, rocksdb::IODebugContext *): Assertion `tlist.find(tdn.second) == tlist.end()' failed.` ``` **Summary:** The PR ensured the non-test path pass down a non-null dir containing CURRENT (which is by current RocksDB assumption just db_dir) by doing the following: - Renamed `directory_to_fsync` as `dir_contains_current_file` in `SetCurrentFile()` to tighten the association between this directory and CURRENT file - Changed `SetCurrentFile()` API to require `dir_contains_current_file` being passed-in, instead of making it by default nullptr. - Because `SetCurrentFile()`'s `dir_contains_current_file` is passed down from `VersionSet::LogAndApply()` then `VersionSet::ProcessManifestWrites()` (i.e, think about this as a chain of 3 functions related to MANIFEST update), these 2 functions also got refactored to require `dir_contains_current_file` - Updated the non-test-path callers of these 3 functions to obtain and pass in non-nullptr `dir_contains_current_file`, which by current assumption of RocksDB, is the `FSDirectory* db_dir`. - `db_impl` path will obtain `DBImpl::directories_.getDbDir()` while others with no access to such `directories_` are obtained on the fly by creating such object `FileSystem::NewDirectory(..)` and manage it by unique pointers to ensure short life time. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10573 Test Plan: - `make check` - Passed the repro db_stress command - For future improvement, since we currently don't assert dir containing CURRENT to be non-nullptr due to https://github.com/facebook/rocksdb/pull/10573#pullrequestreview-1087698899, there is still chances that future developers mistakenly pass down nullptr dir containing CURRENT thus resulting skipped sync dir and cause the bug again. Therefore a smarter test (e.g, such as quoted from ajkr "(make) unsynced data loss to be dropping files corresponding to unsynced directory entries") is still needed. Reviewed By: ajkr Differential Revision: D39005886 Pulled By: hx235 fbshipit-source-id: 336fb9090d0cfa6ca3dd580db86268007dde7f5a
154 lines
4.6 KiB
C++
154 lines
4.6 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#include "rocksdb/experimental.h"
|
|
|
|
#include "db/db_impl/db_impl.h"
|
|
#include "db/version_util.h"
|
|
#include "logging/logging.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
namespace experimental {
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family,
|
|
const Slice* begin, const Slice* end) {
|
|
if (db == nullptr) {
|
|
return Status::InvalidArgument("DB is empty");
|
|
}
|
|
|
|
return db->SuggestCompactRange(column_family, begin, end);
|
|
}
|
|
|
|
Status PromoteL0(DB* db, ColumnFamilyHandle* column_family, int target_level) {
|
|
if (db == nullptr) {
|
|
return Status::InvalidArgument("Didn't recognize DB object");
|
|
}
|
|
return db->PromoteL0(column_family, target_level);
|
|
}
|
|
|
|
#else // ROCKSDB_LITE
|
|
|
|
Status SuggestCompactRange(DB* /*db*/, ColumnFamilyHandle* /*column_family*/,
|
|
const Slice* /*begin*/, const Slice* /*end*/) {
|
|
return Status::NotSupported("Not supported in RocksDB LITE");
|
|
}
|
|
|
|
Status PromoteL0(DB* /*db*/, ColumnFamilyHandle* /*column_family*/,
|
|
int /*target_level*/) {
|
|
return Status::NotSupported("Not supported in RocksDB LITE");
|
|
}
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
Status SuggestCompactRange(DB* db, const Slice* begin, const Slice* end) {
|
|
return SuggestCompactRange(db, db->DefaultColumnFamily(), begin, end);
|
|
}
|
|
|
|
Status UpdateManifestForFilesState(
|
|
const DBOptions& db_opts, const std::string& db_name,
|
|
const std::vector<ColumnFamilyDescriptor>& column_families,
|
|
const UpdateManifestForFilesStateOptions& opts) {
|
|
OfflineManifestWriter w(db_opts, db_name);
|
|
Status s = w.Recover(column_families);
|
|
|
|
size_t files_updated = 0;
|
|
size_t cfs_updated = 0;
|
|
auto fs = db_opts.env->GetFileSystem();
|
|
|
|
for (auto cfd : *w.Versions().GetColumnFamilySet()) {
|
|
if (!s.ok()) {
|
|
break;
|
|
}
|
|
assert(cfd);
|
|
|
|
if (cfd->IsDropped() || !cfd->initialized()) {
|
|
continue;
|
|
}
|
|
|
|
const auto* current = cfd->current();
|
|
assert(current);
|
|
|
|
const auto* vstorage = current->storage_info();
|
|
assert(vstorage);
|
|
|
|
VersionEdit edit;
|
|
edit.SetColumnFamily(cfd->GetID());
|
|
|
|
/* SST files */
|
|
for (int level = 0; level < cfd->NumberLevels(); level++) {
|
|
if (!s.ok()) {
|
|
break;
|
|
}
|
|
const auto& level_files = vstorage->LevelFiles(level);
|
|
|
|
for (const auto& lf : level_files) {
|
|
assert(lf);
|
|
|
|
uint64_t number = lf->fd.GetNumber();
|
|
std::string fname =
|
|
TableFileName(w.IOptions().db_paths, number, lf->fd.GetPathId());
|
|
|
|
std::unique_ptr<FSSequentialFile> f;
|
|
FileOptions fopts;
|
|
fopts.temperature = lf->temperature;
|
|
|
|
IOStatus file_ios =
|
|
fs->NewSequentialFile(fname, fopts, &f, /*dbg*/ nullptr);
|
|
if (file_ios.ok()) {
|
|
if (opts.update_temperatures) {
|
|
Temperature temp = f->GetTemperature();
|
|
if (temp != Temperature::kUnknown && temp != lf->temperature) {
|
|
// Current state inconsistent with manifest
|
|
++files_updated;
|
|
edit.DeleteFile(level, number);
|
|
edit.AddFile(
|
|
level, number, lf->fd.GetPathId(), lf->fd.GetFileSize(),
|
|
lf->smallest, lf->largest, lf->fd.smallest_seqno,
|
|
lf->fd.largest_seqno, lf->marked_for_compaction, temp,
|
|
lf->oldest_blob_file_number, lf->oldest_ancester_time,
|
|
lf->file_creation_time, lf->file_checksum,
|
|
lf->file_checksum_func_name, lf->unique_id);
|
|
}
|
|
}
|
|
} else {
|
|
s = file_ios;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (s.ok() && edit.NumEntries() > 0) {
|
|
std::unique_ptr<FSDirectory> db_dir;
|
|
s = fs->NewDirectory(db_name, IOOptions(), &db_dir, nullptr);
|
|
if (s.ok()) {
|
|
s = w.LogAndApply(cfd, &edit, db_dir.get());
|
|
}
|
|
if (s.ok()) {
|
|
++cfs_updated;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (cfs_updated > 0) {
|
|
ROCKS_LOG_INFO(db_opts.info_log,
|
|
"UpdateManifestForFilesState: updated %zu files in %zu CFs",
|
|
files_updated, cfs_updated);
|
|
} else if (s.ok()) {
|
|
ROCKS_LOG_INFO(db_opts.info_log,
|
|
"UpdateManifestForFilesState: no updates needed");
|
|
}
|
|
if (!s.ok()) {
|
|
ROCKS_LOG_ERROR(db_opts.info_log, "UpdateManifestForFilesState failed: %s",
|
|
s.ToString().c_str());
|
|
}
|
|
|
|
return s;
|
|
}
|
|
|
|
} // namespace experimental
|
|
} // namespace ROCKSDB_NAMESPACE
|