rocksdb/db_stress_tool/expected_value.cc

123 lines
3.9 KiB
C++
Raw Normal View History

// Copyright (c) 2021-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#ifdef GFLAGS
#include "db_stress_tool/expected_value.h"
#include <atomic>
namespace ROCKSDB_NAMESPACE {
Fix false-positive TestBackupRestore corruption (#12917) Summary: **Context:** https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread. The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario: (1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already. (2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed. ``` std::vector<PendingExpectedValue> pending_expected_values = shared->PrepareDeleteRange(rand_column_family, rand_key, rand_key + FLAGS_range_deletion_width, &prepared); if (!prepared) { for (PendingExpectedValue& pending_expected_value : pending_expected_values) { pending_expected_value.PermitUnclosedPendingState(); } return s; } ``` (3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed. 
``` for (int64_t key = key_base; s.ok() && key < shared->GetMaxKey() && static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width; ++key) PendingExpectedValue pending_expected_value = shared->PreparePut(column_family, key, &prepared); if (!prepared) { pending_expected_value.PermitUnclosedPendingState(); for (PendingExpectedValue& pev : pending_expected_values) { pev.PermitUnclosedPendingState(); } return; } } ``` (4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early. ``` bool Exists() const { return PendingPut() || !IsDeleted(); } TestBackupRestore() { ... Status get_status = restored_db->Get( read_opts, restored_cf_handles[rand_column_families[i]], key, &restored_value); bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]); if (get_status.ok()) { if (!exists && from_latest && ShouldAcquireMutexOnKey()) { std::ostringstream oss; oss << "0x" << key.ToString(true) << " exists in restore but not in original db"; s = Status::Corruption(oss.str()); } } else if (get_status.IsNotFound()) { if (exists && from_latest && ShouldAcquireMutexOnKey()) { std::ostringstream oss; oss << "0x" << key.ToString(true) << " exists in original db but not in restore"; s = Status::Corruption(oss.str()); } } ... } ``` So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore` A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread. 
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below. ``` void StressTest::OperateDb(ThreadState* thread) { for (uint64_t i = 0; i < ops_per_open; i++) { ... int64_t rand_key = GenerateOneKey(thread, i); ... } } ``` **Summary:** - Removed the "lock release" part and related checks - Printed recovery time if the write thread waited more than 10 seconds - Reverted regression in testing coverage when deleting a non-existent key Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917 Test Plan: Below command repro-ed frequently before the fix and not after. ``` ./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 
--charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 
--get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 
--prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35 ``` Reviewed By: cbi42 Differential Revision: D60890580 Pulled By: hx235 fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
// Record a Put against this expected value. With `pending == true`, only
// marks the in-flight write ("pending write" bit); with `pending == false`,
// commits it: bumps the value base, clears the deleted flag, and clears the
// pending-write bit.
void ExpectedValue::Put(bool pending) {
  if (!pending) {
    // Commit phase: advance to the next value base and finalize state.
    SetValueBase(NextValueBase());
    ClearDeleted();
    ClearPendingWrite();
    return;
  }
  // Prepare phase: flag the write as in flight.
  SetPendingWrite();
}
bool ExpectedValue::Delete(bool pending) {
Fix improper ExpectedValute::Exists() usages and disable compaction during VerifyDB() in crash test (#12933) Summary: **Context:** Adding assertion `!PendingPut()&&!PendingDelete()` in `ExpectedValute::Exists()` surfaced a couple improper usages of `ExpectedValute::Exists()` in the crash test - Commit phase of `ExpectedValue::Delete()`/`SyncDelete()`: When we issue delete to expected value during commit phase or `SyncDelete()` (used in crash recovery verification) as below, we don't really care about the result. https://github.com/facebook/rocksdb/blob/d458331ee90d0e8b5abd90e9f340d6857f7f679d/db_stress_tool/expected_state.cc#L73 https://github.com/facebook/rocksdb/blob/d458331ee90d0e8b5abd90e9f340d6857f7f679d/db_stress_tool/expected_value.cc#L52 That means, we don't really need to check for `Exists()` https://github.com/facebook/rocksdb/blob/d458331ee90d0e8b5abd90e9f340d6857f7f679d/db_stress_tool/expected_value.cc#L24-L26. This actually gives an alternative solution to https://github.com/facebook/rocksdb/commit/b65e29a4a905dfc94c20a6c6266cd8e9f8ea7448 to solve false-positive assertion violation. - TestMultiGetXX() path: `Exists()` is called without holding the lock as required https://github.com/facebook/rocksdb/blame/f63428bcc7c42308ec1a84e82787b8d5dbd322ae/db_stress_tool/no_batched_ops_stress.cc#L2688 ``` void MaybeAddKeyToTxnForRYW( ThreadState* thread, int column_family, int64_t key, Transaction* txn, std::unordered_map<std::string, ExpectedValue>& ryw_expected_values) { assert(thread); assert(txn); SharedState* const shared = thread->shared; assert(shared); if (!shared->AllowsOverwrite(key) && shared->Exists(column_family, key)) { // Just do read your write checks for keys that allow overwrites. return; } // With a 1 in 10 probability, insert the just added key in the batch // into the transaction. 
This will create an overlap with the MultiGet // keys and exercise some corner cases in the code if (thread->rand.OneIn(10)) { ``` https://github.com/facebook/rocksdb/blob/f63428bcc7c42308ec1a84e82787b8d5dbd322ae/db_stress_tool/expected_state.h#L74-L76 The assertion also failed if db stress compaction filter was invoked before crash recovery verification (`VerifyDB()`->`VerifyOrSyncValue()`) finishes. https://github.com/facebook/rocksdb/blob/f63428bcc7c42308ec1a84e82787b8d5dbd322ae/db_stress_tool/db_stress_compaction_filter.h#L53 It failed because it can encounter a key with pending state when checking for `Exists()` since that key's expected state has not been sync-ed with db state in `VerifyOrSyncValue()`. https://github.com/facebook/rocksdb/blob/f63428bcc7c42308ec1a84e82787b8d5dbd322ae/db_stress_tool/no_batched_ops_stress.cc#L2579-L2591 **Summary:** This PR fixes above issues by - not checking `Exists()` in commit phase/`SyncDelete()` - using the concurrent version of key existence check like in other read - conditionally temporarily disabling compaction till after crash recovery verification succeeds() And add back the assertion `!PendingPut()&&!PendingDelete()` Pull Request resolved: https://github.com/facebook/rocksdb/pull/12933 Test Plan: Rehearsal CI Reviewed By: cbi42 Differential Revision: D61214889 Pulled By: hx235 fbshipit-source-id: ef25ba896e64330ddf330182314981516880c3e4
2024-08-15 19:32:59 +00:00
if (pending && !Exists()) {
return false;
}
if (pending) {
SetPendingDel();
} else {
SetDelCounter(NextDelCounter());
SetDeleted();
ClearPendingDel();
}
return true;
}
// Force this expected value into a committed-Put state with the given
// `value_base`, used when syncing expected state with the actual DB state
// (e.g. during crash-recovery verification).
void ExpectedValue::SyncPut(uint32_t value_base) {
  assert(ExpectedValue::IsValueBaseValid(value_base));
  SetValueBase(value_base);
  ClearDeleted();
  ClearPendingWrite();
  // Also clear any pending delete: a crash may have happened while a delete
  // of the key associated with this expected value was still in flight.
  ClearPendingDel();
}
// Force this expected value into a pending-Put state when syncing expected
// state; delegates to Put() in its prepare phase.
void ExpectedValue::SyncPendingPut() {
  Put(/*pending=*/true);
}
// Force this expected value into a committed-Delete state, used when syncing
// expected state with the actual DB state.
void ExpectedValue::SyncDelete() {
  Delete(/*pending=*/false);
  // Also clear any pending write: a crash may have happened while a write of
  // the key associated with this expected value was still in flight.
  ClearPendingWrite();
}
// Value base this key will have once any in-flight write settles: the next
// value base if a write is pending, otherwise the current one.
uint32_t ExpectedValue::GetFinalValueBase() const {
  if (PendingWrite()) {
    return NextValueBase();
  }
  return GetValueBase();
}
// Delete counter this key will have once any in-flight delete settles: the
// next counter if a delete is pending, otherwise the current one.
uint32_t ExpectedValue::GetFinalDelCounter() const {
  if (PendingDelete()) {
    return NextDelCounter();
  }
  return GetDelCounter();
}
// Whether the key must have been absent for the whole duration of a read,
// given snapshots of its expected value taken before and after the read:
// true iff it was deleted before the read and no write could have landed
// during the read (the pre-read value base equals the post-read final one).
bool ExpectedValueHelper::MustHaveNotExisted(
    ExpectedValue pre_read_expected_value,
    ExpectedValue post_read_expected_value) {
  const bool no_write_during_read =
      pre_read_expected_value.GetValueBase() ==
      post_read_expected_value.GetFinalValueBase();
  return pre_read_expected_value.IsDeleted() && no_write_during_read;
}
// Whether the key must have been present for the whole duration of a read,
// given snapshots of its expected value taken before and after the read:
// true iff it was not deleted before the read and no delete could have landed
// during the read (the pre-read delete counter equals the post-read final
// one).
bool ExpectedValueHelper::MustHaveExisted(
    ExpectedValue pre_read_expected_value,
    ExpectedValue post_read_expected_value) {
  const bool no_delete_during_read =
      pre_read_expected_value.GetDelCounter() ==
      post_read_expected_value.GetFinalDelCounter();
  return !pre_read_expected_value.IsDeleted() && no_delete_during_read;
}
// Whether `value_base` could legitimately have been observed during a read,
// i.e. lies in the (possibly wrapped-around) range from the pre-read value
// base to the post-read final value base.
bool ExpectedValueHelper::InExpectedValueBaseRange(
    uint32_t value_base, ExpectedValue pre_read_expected_value,
    ExpectedValue post_read_expected_value) {
  assert(ExpectedValue::IsValueBaseValid(value_base));
  const uint32_t range_begin = pre_read_expected_value.GetValueBase();
  const uint32_t range_end = post_read_expected_value.GetFinalValueBase();
  if (range_begin <= range_end) {
    // Contiguous range: [range_begin, range_end].
    return range_begin <= value_base && value_base <= range_end;
  }
  // The value base wrapped around its mask during the read, so the valid
  // range splits in two: [0, range_end] and [range_begin, mask].
  return value_base <= range_end ||
         (range_begin <= value_base &&
          value_base <= ExpectedValue::GetValueBaseMask());
}
} // namespace ROCKSDB_NAMESPACE
#endif // GFLAGS