Sync WAL during db Close() (#12556)

Summary:
**Context/Summary:**
Below crash test found out we don't sync WAL upon DB close, which can lead to unsynced data loss. This PR syncs it.
```
./db_stress --threads=1 --disable_auto_compactions=1 --WAL_size_limit_MB=0 --WAL_ttl_seconds=0 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=1 --allow_data_in_errors=True --allow_fallocate=0 --async_io=0 --auto_readahead_size=0 --avoid_flush_during_recovery=1 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=1 --backup_max_size=104857600 --backup_one_in=0 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=1000000 --block_align=0 --block_protection_bytes_per_key=2 --block_size=16384 --bloom_before_level=1 --bloom_bits=29.895303579352174 --bottommost_compression_type=disable --bottommost_file_compaction_delay=0 --bytes_per_sync=0 --cache_index_and_filter_blocks=0 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=33554432 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=1 --charge_table_reader=1 --checkpoint_one_in=0 --checksum_type=kxxHash64 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=0 --compact_range_one_in=0 --compaction_pri=0 --compaction_readahead_size=0 --compaction_style=0 --compaction_ttl=0 --compress_format_version=2 --compressed_secondary_cache_ratio=0 --compressed_secondary_cache_size=0 --compression_checksum=1 --compression_max_dict_buffer_bytes=0 --compression_max_dict_bytes=0 --compression_parallel_threads=4 --compression_type=zstd --compression_use_zstd_dict_trainer=1 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kUnknown --delete_obsolete_files_period_micros=0 --delpercent=0 --delrangepercent=0 --destroy_db_initially=1 --detect_filter_construct_corruption=1 --disable_wal=0 --dump_malloc_stats=0 --enable_checksum_handoff=0 --enable_compaction_filter=0 --enable_custom_split_merge=0 --enable_do_not_compress_roles=1 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=0 --enable_sst_partitioner_factory=0 --enable_thread_tracking=1 --enable_write_thread_adaptive_yield=0 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=none --fill_cache=0 --flush_one_in=1000 --format_version=5 --get_current_wal_file_one_in=0 --get_live_files_one_in=0 --get_property_one_in=0 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=274877906944 --high_pri_pool_ratio=0 --index_block_restart_interval=6 --index_shortening=0 --index_type=0 --ingest_external_file_one_in=0 --initial_auto_readahead_size=16384 --iterpercent=0 --key_len_percent_dist=1,30,69 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=1 --lock_wal_one_in=0 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=16777216 --long_running_snapshots=0 --low_pri_pool_ratio=0 --lowest_used_cache_tier=0 --manifest_preallocation_size=5120 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=0 --max_auto_readahead_size=0 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=2500000 --max_key_len=3 --max_log_file_size=0 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=8 --max_total_wal_size=0 --max_write_batch_group_size_bytes=64 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=0 --memtable_insert_hint_per_batch=0 --memtable_max_range_deletions=0 --memtable_prefix_bloom_size_ratio=0.5 --memtable_protection_bytes_per_key=1 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=0 --min_write_buffer_number_to_merge=1 --mmap_read=0 --mock_direct_io=True --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --num_levels=1 --open_files=-1 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=3 --optimize_filters_for_hits=1 --optimize_filters_for_memory=1 --optimize_multiget_for_io=0 --paranoid_file_checks=0 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=0 --periodic_compaction_seconds=0 --prefix_size=1 --prefixpercent=0 --prepopulate_block_cache=0 --preserve_internal_time_seconds=3600 --progress_reports=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=16384 --readpercent=0 --recycle_log_file_num=0 --reopen=2 --report_bg_io_stats=1 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=0 --soft_pending_compaction_bytes_limit=68719476736 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=0 --subcompactions=3 --sync=0 --sync_fault_injection=1 --table_cache_numshardbits=6 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=0 --unpartitioned_pinning=3 --use_adaptive_mutex=1 --use_adaptive_mutex_lru=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000 --verify_compression=0 --verify_db_one_in=100000 --verify_file_checksums_one_in=0 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=zstd --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=100

 Verification failed for column family 0 key 000000000000B9D1000000000000012B000000000000017D (4756691): value_from_db: , value_from_expected: 010000000504070609080B0A0D0C0F0E111013121514171619181B1A1D1C1F1E212023222524272629282B2A2D2C2F2E313033323534373639383B3A3D3C3F3E, msg: Iterator verification: Value not found: NotFound:
Verification failed :(
```

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12556

Test Plan:
- New UT
- Same stress test command failed before this fix but pass after
- CI

Reviewed By: ajkr

Differential Revision: D56267964

Pulled By: hx235

fbshipit-source-id: af1b7e8769c129f64ba1c7f1ff17102f1239b929
This commit is contained in:
Hui Xiao 2024-05-20 17:33:43 -07:00 committed by Facebook GitHub Bot
parent ef1d4955ba
commit d7b938882e
13 changed files with 83 additions and 25 deletions

View File

@ -626,6 +626,19 @@ Status DBImpl::CloseHelper() {
job_context.Clean();
mutex_.Lock();
}
if (!mutable_db_options_.avoid_sync_during_shutdown && !logs_.empty()) {
mutex_.Unlock();
Status s = SyncWAL();
mutex_.Lock();
if (!s.ok()) {
ROCKS_LOG_WARN(immutable_db_options_.info_log,
"Unable to sync WALs with error -- %s",
s.ToString().c_str());
if (ret.ok()) {
ret = s;
}
}
}
{
InstrumentedMutexLock lock(&log_write_mutex_);
for (auto l : logs_to_free_) {
@ -637,7 +650,7 @@ Status DBImpl::CloseHelper() {
if (!s.ok()) {
ROCKS_LOG_WARN(
immutable_db_options_.info_log,
"Unable to Sync WAL file %s with error -- %s",
"Unable to clear writer for WAL %s with error -- %s",
LogFileName(immutable_db_options_.GetWalDir(), log_number).c_str(),
s.ToString().c_str());
// Retain the first error

View File

@ -12,6 +12,7 @@
// file data (or entire files) not protected by a "sync".
#include "db/db_impl/db_impl.h"
#include "db/db_test_util.h"
#include "db/log_format.h"
#include "db/version_set.h"
#include "env/mock_env.h"
@ -629,6 +630,34 @@ INSTANTIATE_TEST_CASE_P(
std::make_tuple(false, kSyncWal, kEnd),
std::make_tuple(true, kSyncWal, kEnd)));
class FaultInjectionFSTest : public DBTestBase {
public:
FaultInjectionFSTest()
: DBTestBase("fault_injection_fs_test", /*env_do_fsync=*/false) {}
};
TEST_F(FaultInjectionFSTest, SyncWALDuringDBClose) {
std::shared_ptr<FaultInjectionTestFS> fault_fs =
std::make_shared<FaultInjectionTestFS>(env_->GetFileSystem());
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fault_fs));
Options options = CurrentOptions();
options.avoid_sync_during_shutdown = true;
options.env = env.get();
Reopen(options);
ASSERT_OK(Put("k1", "v1"));
Close();
Reopen(options);
ASSERT_EQ("NOT_FOUND", Get("k1"));
Destroy(options);
options.avoid_sync_during_shutdown = false;
Reopen(options);
ASSERT_OK(Put("k1", "v1"));
Close();
Reopen(options);
ASSERT_EQ("v1", Get("k1"));
Destroy(options);
}
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {

View File

@ -379,6 +379,7 @@ DECLARE_uint64(WAL_ttl_seconds);
DECLARE_uint64(WAL_size_limit_MB);
DECLARE_bool(strict_bytes_per_sync);
DECLARE_bool(avoid_flush_during_shutdown);
DECLARE_bool(avoid_sync_during_shutdown);
DECLARE_bool(fill_cache);
DECLARE_bool(optimize_multiget_for_io);
DECLARE_bool(memtable_insert_hint_per_batch);

View File

@ -976,6 +976,10 @@ DEFINE_bool(avoid_flush_during_recovery,
ROCKSDB_NAMESPACE::Options().avoid_flush_during_recovery,
"Avoid flush during recovery");
DEFINE_bool(avoid_sync_during_shutdown,
ROCKSDB_NAMESPACE::Options().avoid_sync_during_shutdown,
"Options.avoid_sync_during_shutdown");
DEFINE_uint64(max_write_batch_group_size_bytes,
ROCKSDB_NAMESPACE::Options().max_write_batch_group_size_bytes,
"Max write batch group size");

View File

@ -3787,6 +3787,7 @@ void InitializeOptionsFromFlags(
options.wal_bytes_per_sync = FLAGS_wal_bytes_per_sync;
options.strict_bytes_per_sync = FLAGS_strict_bytes_per_sync;
options.avoid_flush_during_shutdown = FLAGS_avoid_flush_during_shutdown;
options.avoid_sync_during_shutdown = FLAGS_avoid_sync_during_shutdown;
options.dump_malloc_stats = FLAGS_dump_malloc_stats;
options.stats_history_buffer_size = FLAGS_stats_history_buffer_size;
options.skip_stats_update_on_db_open = FLAGS_skip_stats_update_on_db_open;

View File

@ -64,7 +64,7 @@ IOStatus WritableFileWriter::Create(const std::shared_ptr<FileSystem>& fs,
IOStatus WritableFileWriter::Append(const IOOptions& opts, const Slice& data,
uint32_t crc32c_checksum) {
if (seen_error()) {
return AssertFalseAndGetStatusForPrevError();
return GetWriterHasPreviousErrorStatus();
}
StopWatch sw(clock_, stats_, hist_type_,
@ -199,7 +199,7 @@ IOStatus WritableFileWriter::Append(const IOOptions& opts, const Slice& data,
IOStatus WritableFileWriter::Pad(const IOOptions& opts,
const size_t pad_bytes) {
if (seen_error()) {
return AssertFalseAndGetStatusForPrevError();
return GetWriterHasPreviousErrorStatus();
}
const IOOptions io_options = FinalizeIOOptions(opts);
assert(pad_bytes < kDefaultPageSize);
@ -348,7 +348,7 @@ IOStatus WritableFileWriter::Close(const IOOptions& opts) {
// enabled
IOStatus WritableFileWriter::Flush(const IOOptions& opts) {
if (seen_error()) {
return AssertFalseAndGetStatusForPrevError();
return GetWriterHasPreviousErrorStatus();
}
const IOOptions io_options = FinalizeIOOptions(opts);
@ -458,7 +458,7 @@ IOStatus WritableFileWriter::PrepareIOOptions(const WriteOptions& wo,
IOStatus WritableFileWriter::Sync(const IOOptions& opts, bool use_fsync) {
if (seen_error()) {
return AssertFalseAndGetStatusForPrevError();
return GetWriterHasPreviousErrorStatus();
}
IOOptions io_options = FinalizeIOOptions(opts);
@ -483,7 +483,7 @@ IOStatus WritableFileWriter::Sync(const IOOptions& opts, bool use_fsync) {
IOStatus WritableFileWriter::SyncWithoutFlush(const IOOptions& opts,
bool use_fsync) {
if (seen_error()) {
return AssertFalseAndGetStatusForPrevError();
return GetWriterHasPreviousErrorStatus();
}
IOOptions io_options = FinalizeIOOptions(opts);
if (!writable_file_->IsSyncThreadSafe()) {
@ -495,9 +495,6 @@ IOStatus WritableFileWriter::SyncWithoutFlush(const IOOptions& opts,
IOStatus s = SyncInternal(io_options, use_fsync);
TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:2");
if (!s.ok()) {
#ifndef NDEBUG
sync_without_flush_called_ = true;
#endif // NDEBUG
set_seen_error();
}
return s;
@ -543,7 +540,7 @@ IOStatus WritableFileWriter::SyncInternal(const IOOptions& opts,
IOStatus WritableFileWriter::RangeSync(const IOOptions& opts, uint64_t offset,
uint64_t nbytes) {
if (seen_error()) {
return AssertFalseAndGetStatusForPrevError();
return GetWriterHasPreviousErrorStatus();
}
IOSTATS_TIMER_GUARD(range_sync_nanos);
@ -572,7 +569,7 @@ IOStatus WritableFileWriter::RangeSync(const IOOptions& opts, uint64_t offset,
IOStatus WritableFileWriter::WriteBuffered(const IOOptions& opts,
const char* data, size_t size) {
if (seen_error()) {
return AssertFalseAndGetStatusForPrevError();
return GetWriterHasPreviousErrorStatus();
}
IOStatus s;
@ -663,7 +660,7 @@ IOStatus WritableFileWriter::WriteBufferedWithChecksum(const IOOptions& opts,
const char* data,
size_t size) {
if (seen_error()) {
return AssertFalseAndGetStatusForPrevError();
return GetWriterHasPreviousErrorStatus();
}
IOStatus s;
@ -877,7 +874,7 @@ IOStatus WritableFileWriter::WriteDirect(const IOOptions& opts) {
IOStatus WritableFileWriter::WriteDirectWithChecksum(const IOOptions& opts) {
if (seen_error()) {
return AssertFalseAndGetStatusForPrevError();
return GetWriterHasPreviousErrorStatus();
}
assert(use_direct_io());

View File

@ -149,13 +149,6 @@ class WritableFileWriter {
uint64_t next_write_offset_;
bool pending_sync_;
std::atomic<bool> seen_error_;
#ifndef NDEBUG
// SyncWithoutFlush() is the function that is allowed to be called
// concurrently with other function. One of the concurrent call
// could set seen_error_, and the other one would hit assertion
// in debug mode.
std::atomic<bool> sync_without_flush_called_ = false;
#endif // NDEBUG
uint64_t last_sync_size_;
uint64_t bytes_per_sync_;
RateLimiter* rate_limiter_;
@ -304,9 +297,7 @@ class WritableFileWriter {
}
void set_seen_error() { seen_error_.store(true, std::memory_order_relaxed); }
IOStatus AssertFalseAndGetStatusForPrevError() {
// This should only happen if SyncWithoutFlush() was called.
assert(sync_without_flush_called_);
IOStatus GetWriterHasPreviousErrorStatus() {
return IOStatus::IOError("Writer has previous error.");
}

View File

@ -1265,6 +1265,15 @@ struct DBOptions {
// Dynamically changeable through SetDBOptions() API.
bool avoid_flush_during_shutdown = false;
// By default RocksDB will not sync WAL on DB close even if there are
// unpersisted data (i.e. unsynced WAL data). This can speedup
// DB close. Unpersisted data WILL BE LOST.
//
// DEFAULT: true
//
// Dynamically changeable through SetDBOptions() API.
bool avoid_sync_during_shutdown = true;
// Set this option to true during creation of database if you want
// to be able to ingest behind (call IngestExternalFile() skipping keys
// that already exist, rather than overwriting matching keys).

View File

@ -72,6 +72,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
{offsetof(struct MutableDBOptions, avoid_flush_during_shutdown),
OptionType::kBoolean, OptionVerificationType::kNormal,
OptionTypeFlags::kMutable}},
{"avoid_sync_during_shutdown",
{offsetof(struct MutableDBOptions, avoid_sync_during_shutdown),
OptionType::kBoolean, OptionVerificationType::kNormal,
OptionTypeFlags::kMutable}},
{"writable_file_max_buffer_size",
{offsetof(struct MutableDBOptions, writable_file_max_buffer_size),
OptionType::kSizeT, OptionVerificationType::kNormal,
@ -990,6 +994,7 @@ MutableDBOptions::MutableDBOptions()
max_background_compactions(-1),
max_subcompactions(0),
avoid_flush_during_shutdown(false),
avoid_sync_during_shutdown(true),
writable_file_max_buffer_size(1024 * 1024),
delayed_write_rate(2 * 1024U * 1024U),
max_total_wal_size(0),
@ -1009,6 +1014,7 @@ MutableDBOptions::MutableDBOptions(const DBOptions& options)
max_background_compactions(options.max_background_compactions),
max_subcompactions(options.max_subcompactions),
avoid_flush_during_shutdown(options.avoid_flush_during_shutdown),
avoid_sync_during_shutdown(options.avoid_sync_during_shutdown),
writable_file_max_buffer_size(options.writable_file_max_buffer_size),
delayed_write_rate(options.delayed_write_rate),
max_total_wal_size(options.max_total_wal_size),
@ -1034,6 +1040,8 @@ void MutableDBOptions::Dump(Logger* log) const {
max_subcompactions);
ROCKS_LOG_HEADER(log, " Options.avoid_flush_during_shutdown: %d",
avoid_flush_during_shutdown);
ROCKS_LOG_HEADER(log, " Options.avoid_sync_during_shutdown: %d",
avoid_sync_during_shutdown);
ROCKS_LOG_HEADER(
log, " Options.writable_file_max_buffer_size: %" ROCKSDB_PRIszt,
writable_file_max_buffer_size);

View File

@ -125,6 +125,7 @@ struct MutableDBOptions {
int max_background_compactions;
uint32_t max_subcompactions;
bool avoid_flush_during_shutdown;
bool avoid_sync_during_shutdown;
size_t writable_file_max_buffer_size;
uint64_t delayed_write_rate;
uint64_t max_total_wal_size;

View File

@ -348,6 +348,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
"allow_2pc=false;"
"avoid_flush_during_recovery=false;"
"avoid_flush_during_shutdown=false;"
"avoid_sync_during_shutdown=false;"
"allow_ingest_behind=false;"
"concurrent_prepare=false;"
"two_write_queues=false;"

View File

@ -271,6 +271,7 @@ default_params = {
"WAL_size_limit_MB": lambda: random.choice([0, 1]),
"strict_bytes_per_sync": lambda: random.choice([0, 1]),
"avoid_flush_during_shutdown": lambda: random.choice([0, 1]),
"avoid_sync_during_shutdown": lambda: random.choice([0, 1]),
"fill_cache": lambda: random.choice([0, 1]),
"optimize_multiget_for_io": lambda: random.choice([0, 1]),
"memtable_insert_hint_per_batch": lambda: random.choice([0, 1]),
@ -739,11 +740,12 @@ def finalize_and_sanitize(src_params):
# files, which would be problematic when unsynced data can be lost in
# crash recoveries.
dest_params["enable_compaction_filter"] = 0
# TODO(hx235): re-enable "reopen" after supporting unsynced data loss
# TODO(hx235): re-enable "reopen" after supporting unsynced data loss
# verification upon reopen. Currently reopen does not restore expected state
# with potential data loss in mind like start of each `./db_stress` run.
# Therefore it always expects no data loss.
dest_params["reopen"] = 0
if (dest_params.get("avoid_sync_during_shutdown") == 1):
dest_params["reopen"] = 0
# Only under WritePrepared txns, unordered_write would provide the same guarnatees as vanilla rocksdb
# unordered_write is only enabled with --txn, and txn_params disables inplace_update_support, so
# setting allow_concurrent_memtable_write=1 won't conflcit with inplace_update_support.

View File

@ -0,0 +1 @@
Introduce a new DB option `avoid_sync_during_shutdown`. If set true, it will sync WALs during DB close