mirror of
https://github.com/facebook/rocksdb.git
synced 2024-12-02 01:16:16 +00:00
f97e33454f
Summary:
A recent crash test failure shows that auto recovery from WAL write failure can cause CFs to be inconsistent. A unit test repro in P1569398553. The following is an example sequence of events:
```
0. manual_wal_flush is true. There are multiple CFs in a DB.
1. Submit a write batch with updates to multiple CF
2. A FlushWAL or a memtable swtich that will try to write the buffered WAL data. Fail this write so that buffered WAL data is dropped: 4b1d595306/file/writable_file_writer.cc (L624)
The error needs to be retryable to start background auto recovery.
3. One CF successfully flushes its memtable during auto recovery.
4. Crash the process.
5. Reopen the DB, one CF will have the update as a result of successful flush. Other CFs will miss all the updates in the write batch since WAL does not have them.
```
This can happen if a users configures manual_wal_flush, uses more than one CF, and can hit retryable error for WAL writes. This PR is a short-term fix that upgrades WAL related errors to fatal and not trigger auto recovery.
A long-term fix may be not drop buffered WAL data by checking how much data is actually written, or require atomically flushing all column families during error recovery from this kind of errors.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12995
Test Plan:
added unit test to check error severity and if recovery is triggered. A crash test repro command that fails in a few runs before this PR:
```
python3 ./tools/db_crashtest.py blackbox --interval=60 --metadata_write_fault_one_in=1000 --column_families=10 --exclude_wal_from_write_fault_injection=0 --manual_wal_flush_one_in=1000 --WAL_size_limit_MB=10240 --WAL_ttl_seconds=0 --acquire_snapshot_one_in=10000 --adaptive_readahead=1 --adm_policy=1 --advise_random_on_open=1 --allow_data_in_errors=True --allow_fallocate=1 --async_io=0 --auto_readahead_size=0 --avoid_flush_during_recovery=1 --avoid_flush_during_shutdown=1 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=0 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --block_align=1 --block_protection_bytes_per_key=0 --block_size=16384 --bloom_before_level=2147483647 --bottommost_compression_type=none --bottommost_file_compaction_delay=0 --bytes_per_sync=0 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=33554432 --cache_type=auto_hyper_clock_cache --charge_compression_dictionary_building_buffer=0 --charge_file_metadata=1 --charge_filter_construction=1 --charge_table_reader=0 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=0 --checksum_type=kxxHash64 --clear_column_family_one_in=0 --compact_files_one_in=0 --compact_range_one_in=0 --compaction_pri=1 --compaction_readahead_size=1048576 --compaction_ttl=0 --compress_format_version=1 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=0 --compression_max_dict_bytes=0 --compression_parallel_threads=4 --compression_type=none --compression_use_zstd_dict_trainer=1 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc= --data_block_index_type=0 --db_write_buffer_size=0 --decouple_partitioned_filters=1 --default_temperature=kCold --default_write_temperature=kWarm --delete_obsolete_files_period_micros=30000000 --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --detect_filter_construct_corruption=0 --disable_file_deletions_one_in=1000000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=1 --enable_checksum_handoff=1 --enable_compaction_filter=0 --enable_custom_split_merge=0 --enable_do_not_compress_roles=0 --enable_index_compression=0 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=0 --enable_thread_tracking=1 --enable_write_thread_adaptive_yield=1 --error_recovery_with_no_fault_injection=1 --fail_if_options_file_error=1 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=6 --get_all_column_family_metadata_one_in=1000000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=10000 --get_properties_of_all_tables_one_in=1000000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=274877906944 --index_block_restart_interval=4 --index_shortening=1 --index_type=0 --ingest_external_file_one_in=0 --initial_auto_readahead_size=16384 --inplace_update_support=0 --iterpercent=10 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100000 --last_level_temperature=kWarm --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=0 --lowest_used_cache_tier=2 --manifest_preallocation_size=5120 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=0 --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=100000 --max_key_len=3 --max_log_file_size=0 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16777216 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=2097152 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=0 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=2 --memtable_whole_key_filtering=0 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --min_write_buffer_number_to_merge=1 --mmap_read=1 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=2 --open_files=100 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --optimize_filters_for_hits=0 --optimize_filters_for_memory=0 --optimize_multiget_for_io=0 --paranoid_file_checks=1 --paranoid_memory_checks=0 --partition_filters=0 --partition_pinning=2 --pause_background_one_in=10000 --periodic_compaction_seconds=0 --prefix_size=8 --prefixpercent=5 --prepopulate_block_cache=0 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=45 --recycle_log_file_num=0 --reopen=0 --report_bg_io_stats=0 --reset_stats_one_in=10000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --set_options_one_in=10000 --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=1048576 --sqfc_name=bar --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=600 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=1 --table_cache_numshardbits=6 --target_file_size_base=524288 --target_file_size_multiplier=2 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=8 --universal_max_read_amp=-1 --unpartitioned_pinning=2 --use_adaptive_mutex=1 --use_adaptive_mutex_lru=0 --use_attribute_group=1 --use_delta_encoding=0 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=1 --use_multi_cf_iterator=1 --use_multi_get_entity=0 --use_multiget=0 --use_put_entity_one_in=1 --use_sqfc_for_range_queries=0 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=1 --verify_db_one_in=100000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=4194304 --write_dbid_to_manifest=0 --write_fault_one_in=50 --writepercent=35 --ops_per_thread=100000 --preserve_unverified_changes=1
```
Reviewed By: hx235
Differential Revision: D62888510
Pulled By: cbi42
fbshipit-source-id: 308bdbbb8d897cc8eba950155cd0e37cf7eb76fe
333 lines
9.6 KiB
C++
333 lines
9.6 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
//
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#ifndef NDEBUG
|
|
|
|
#include "db/column_family.h"
|
|
#include "db/db_impl/db_impl.h"
|
|
#include "db/error_handler.h"
|
|
#include "db/periodic_task_scheduler.h"
|
|
#include "monitoring/thread_status_updater.h"
|
|
#include "util/cast_util.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
uint64_t DBImpl::TEST_GetLevel0TotalSize() {
|
|
InstrumentedMutexLock l(&mutex_);
|
|
return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0);
|
|
}
|
|
|
|
Status DBImpl::TEST_SwitchWAL() {
|
|
WriteContext write_context;
|
|
InstrumentedMutexLock l(&mutex_);
|
|
void* writer = TEST_BeginWrite();
|
|
auto s = SwitchWAL(&write_context);
|
|
TEST_EndWrite(writer);
|
|
return s;
|
|
}
|
|
|
|
uint64_t DBImpl::TEST_MaxNextLevelOverlappingBytes(
|
|
ColumnFamilyHandle* column_family) {
|
|
ColumnFamilyData* cfd;
|
|
if (column_family == nullptr) {
|
|
cfd = default_cf_handle_->cfd();
|
|
} else {
|
|
auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
|
|
cfd = cfh->cfd();
|
|
}
|
|
InstrumentedMutexLock l(&mutex_);
|
|
return cfd->current()->storage_info()->MaxNextLevelOverlappingBytes();
|
|
}
|
|
|
|
void DBImpl::TEST_GetFilesMetaData(
|
|
ColumnFamilyHandle* column_family,
|
|
std::vector<std::vector<FileMetaData>>* metadata,
|
|
std::vector<std::shared_ptr<BlobFileMetaData>>* blob_metadata) {
|
|
assert(metadata);
|
|
|
|
auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
|
|
assert(cfh);
|
|
|
|
auto cfd = cfh->cfd();
|
|
assert(cfd);
|
|
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
const auto* current = cfd->current();
|
|
assert(current);
|
|
|
|
const auto* vstorage = current->storage_info();
|
|
assert(vstorage);
|
|
|
|
metadata->resize(NumberLevels());
|
|
|
|
for (int level = 0; level < NumberLevels(); ++level) {
|
|
const std::vector<FileMetaData*>& files = vstorage->LevelFiles(level);
|
|
|
|
(*metadata)[level].clear();
|
|
(*metadata)[level].reserve(files.size());
|
|
|
|
for (const auto& f : files) {
|
|
(*metadata)[level].push_back(*f);
|
|
}
|
|
}
|
|
|
|
if (blob_metadata) {
|
|
*blob_metadata = vstorage->GetBlobFiles();
|
|
}
|
|
}
|
|
|
|
uint64_t DBImpl::TEST_Current_Manifest_FileNo() {
|
|
return versions_->manifest_file_number();
|
|
}
|
|
|
|
uint64_t DBImpl::TEST_Current_Next_FileNo() {
|
|
return versions_->current_next_file_number();
|
|
}
|
|
|
|
Status DBImpl::TEST_CompactRange(int level, const Slice* begin,
|
|
const Slice* end,
|
|
ColumnFamilyHandle* column_family,
|
|
bool disallow_trivial_move) {
|
|
ColumnFamilyData* cfd;
|
|
if (column_family == nullptr) {
|
|
cfd = default_cf_handle_->cfd();
|
|
} else {
|
|
auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
|
|
cfd = cfh->cfd();
|
|
}
|
|
int output_level =
|
|
(cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
|
|
cfd->ioptions()->compaction_style == kCompactionStyleFIFO)
|
|
? level
|
|
: level + 1;
|
|
return RunManualCompaction(
|
|
cfd, level, output_level, CompactRangeOptions(), begin, end, true,
|
|
disallow_trivial_move,
|
|
std::numeric_limits<uint64_t>::max() /*max_file_num_to_ignore*/,
|
|
"" /*trim_ts*/);
|
|
}
|
|
|
|
Status DBImpl::TEST_SwitchMemtable(ColumnFamilyData* cfd) {
|
|
WriteContext write_context;
|
|
InstrumentedMutexLock l(&mutex_);
|
|
if (cfd == nullptr) {
|
|
cfd = default_cf_handle_->cfd();
|
|
}
|
|
|
|
Status s;
|
|
void* writer = TEST_BeginWrite();
|
|
if (two_write_queues_) {
|
|
WriteThread::Writer nonmem_w;
|
|
nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
|
|
s = SwitchMemtable(cfd, &write_context);
|
|
nonmem_write_thread_.ExitUnbatched(&nonmem_w);
|
|
} else {
|
|
s = SwitchMemtable(cfd, &write_context);
|
|
}
|
|
TEST_EndWrite(writer);
|
|
return s;
|
|
}
|
|
|
|
Status DBImpl::TEST_FlushMemTable(bool wait, bool allow_write_stall,
|
|
ColumnFamilyHandle* cfh) {
|
|
FlushOptions fo;
|
|
fo.wait = wait;
|
|
fo.allow_write_stall = allow_write_stall;
|
|
ColumnFamilyData* cfd;
|
|
if (cfh == nullptr) {
|
|
cfd = default_cf_handle_->cfd();
|
|
} else {
|
|
auto cfhi = static_cast_with_check<ColumnFamilyHandleImpl>(cfh);
|
|
cfd = cfhi->cfd();
|
|
}
|
|
return FlushMemTable(cfd, fo, FlushReason::kTest);
|
|
}
|
|
|
|
Status DBImpl::TEST_FlushMemTable(ColumnFamilyData* cfd,
|
|
const FlushOptions& flush_opts) {
|
|
return FlushMemTable(cfd, flush_opts, FlushReason::kTest);
|
|
}
|
|
|
|
Status DBImpl::TEST_AtomicFlushMemTables(
|
|
const autovector<ColumnFamilyData*>& provided_candidate_cfds,
|
|
const FlushOptions& flush_opts) {
|
|
return AtomicFlushMemTables(flush_opts, FlushReason::kTest,
|
|
provided_candidate_cfds);
|
|
}
|
|
|
|
Status DBImpl::TEST_WaitForBackgroundWork() {
|
|
InstrumentedMutexLock l(&mutex_);
|
|
WaitForBackgroundWork();
|
|
return error_handler_.GetBGError();
|
|
}
|
|
|
|
Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) {
|
|
ColumnFamilyData* cfd;
|
|
if (column_family == nullptr) {
|
|
cfd = default_cf_handle_->cfd();
|
|
} else {
|
|
auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
|
|
cfd = cfh->cfd();
|
|
}
|
|
return WaitForFlushMemTable(cfd, nullptr, false);
|
|
}
|
|
|
|
Status DBImpl::TEST_WaitForCompact() {
|
|
return WaitForCompact(WaitForCompactOptions());
|
|
}
|
|
Status DBImpl::TEST_WaitForCompact(
|
|
const WaitForCompactOptions& wait_for_compact_options) {
|
|
return WaitForCompact(wait_for_compact_options);
|
|
}
|
|
|
|
Status DBImpl::TEST_WaitForPurge() {
|
|
InstrumentedMutexLock l(&mutex_);
|
|
while (bg_purge_scheduled_ && error_handler_.GetBGError().ok()) {
|
|
bg_cv_.Wait();
|
|
}
|
|
return error_handler_.GetBGError();
|
|
}
|
|
|
|
Status DBImpl::TEST_GetBGError() {
|
|
InstrumentedMutexLock l(&mutex_);
|
|
return error_handler_.GetBGError();
|
|
}
|
|
|
|
bool DBImpl::TEST_IsRecoveryInProgress() {
|
|
InstrumentedMutexLock l(&mutex_);
|
|
return error_handler_.IsRecoveryInProgress();
|
|
}
|
|
|
|
void DBImpl::TEST_LockMutex() { mutex_.Lock(); }
|
|
|
|
void DBImpl::TEST_UnlockMutex() { mutex_.Unlock(); }
|
|
|
|
void DBImpl::TEST_SignalAllBgCv() { bg_cv_.SignalAll(); }
|
|
|
|
void* DBImpl::TEST_BeginWrite() {
|
|
auto w = new WriteThread::Writer();
|
|
write_thread_.EnterUnbatched(w, &mutex_);
|
|
return static_cast<void*>(w);
|
|
}
|
|
|
|
void DBImpl::TEST_EndWrite(void* w) {
|
|
auto writer = static_cast<WriteThread::Writer*>(w);
|
|
write_thread_.ExitUnbatched(writer);
|
|
delete writer;
|
|
}
|
|
|
|
size_t DBImpl::TEST_LogsToFreeSize() {
|
|
InstrumentedMutexLock l(&log_write_mutex_);
|
|
return logs_to_free_.size();
|
|
}
|
|
|
|
uint64_t DBImpl::TEST_LogfileNumber() {
|
|
InstrumentedMutexLock l(&mutex_);
|
|
return logfile_number_;
|
|
}
|
|
|
|
Status DBImpl::TEST_GetAllImmutableCFOptions(
|
|
std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map) {
|
|
std::vector<std::string> cf_names;
|
|
std::vector<const ImmutableCFOptions*> iopts;
|
|
{
|
|
InstrumentedMutexLock l(&mutex_);
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
cf_names.push_back(cfd->GetName());
|
|
iopts.push_back(cfd->ioptions());
|
|
}
|
|
}
|
|
iopts_map->clear();
|
|
for (size_t i = 0; i < cf_names.size(); ++i) {
|
|
iopts_map->insert({cf_names[i], iopts[i]});
|
|
}
|
|
|
|
return Status::OK();
|
|
}
|
|
|
|
uint64_t DBImpl::TEST_FindMinLogContainingOutstandingPrep() {
|
|
return logs_with_prep_tracker_.FindMinLogContainingOutstandingPrep();
|
|
}
|
|
|
|
size_t DBImpl::TEST_PreparedSectionCompletedSize() {
|
|
return logs_with_prep_tracker_.TEST_PreparedSectionCompletedSize();
|
|
}
|
|
|
|
size_t DBImpl::TEST_LogsWithPrepSize() {
|
|
return logs_with_prep_tracker_.TEST_LogsWithPrepSize();
|
|
}
|
|
|
|
uint64_t DBImpl::TEST_FindMinPrepLogReferencedByMemTable() {
|
|
autovector<MemTable*> empty_list;
|
|
return FindMinPrepLogReferencedByMemTable(versions_.get(), empty_list);
|
|
}
|
|
|
|
Status DBImpl::TEST_GetLatestMutableCFOptions(
|
|
ColumnFamilyHandle* column_family, MutableCFOptions* mutable_cf_options) {
|
|
InstrumentedMutexLock l(&mutex_);
|
|
|
|
auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
|
|
*mutable_cf_options = *cfh->cfd()->GetLatestMutableCFOptions();
|
|
return Status::OK();
|
|
}
|
|
|
|
int DBImpl::TEST_BGCompactionsAllowed() const {
|
|
InstrumentedMutexLock l(&mutex_);
|
|
return GetBGJobLimits().max_compactions;
|
|
}
|
|
|
|
int DBImpl::TEST_BGFlushesAllowed() const {
|
|
InstrumentedMutexLock l(&mutex_);
|
|
return GetBGJobLimits().max_flushes;
|
|
}
|
|
|
|
SequenceNumber DBImpl::TEST_GetLastVisibleSequence() const {
|
|
if (last_seq_same_as_publish_seq_) {
|
|
return versions_->LastSequence();
|
|
} else {
|
|
return versions_->LastAllocatedSequence();
|
|
}
|
|
}
|
|
|
|
size_t DBImpl::TEST_GetWalPreallocateBlockSize(
|
|
uint64_t write_buffer_size) const {
|
|
InstrumentedMutexLock l(&mutex_);
|
|
return GetWalPreallocateBlockSize(write_buffer_size);
|
|
}
|
|
|
|
void DBImpl::TEST_WaitForPeriodicTaskRun(std::function<void()> callback) const {
|
|
periodic_task_scheduler_.TEST_WaitForRun(callback);
|
|
}
|
|
|
|
const PeriodicTaskScheduler& DBImpl::TEST_GetPeriodicTaskScheduler() const {
|
|
return periodic_task_scheduler_;
|
|
}
|
|
|
|
SeqnoToTimeMapping DBImpl::TEST_GetSeqnoToTimeMapping() const {
|
|
InstrumentedMutexLock l(&mutex_);
|
|
return seqno_to_time_mapping_;
|
|
}
|
|
|
|
const autovector<uint64_t>& DBImpl::TEST_GetFilesToQuarantine() const {
|
|
InstrumentedMutexLock l(&mutex_);
|
|
return error_handler_.GetFilesToQuarantine();
|
|
}
|
|
|
|
void DBImpl::TEST_DeleteObsoleteFiles() {
|
|
InstrumentedMutexLock l(&mutex_);
|
|
DeleteObsoleteFiles();
|
|
}
|
|
|
|
size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const {
|
|
InstrumentedMutexLock l(&const_cast<DBImpl*>(this)->stats_history_mutex_);
|
|
return EstimateInMemoryStatsHistorySize();
|
|
}
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
#endif // NDEBUG
|