2023-05-18 16:44:15 +00:00
|
|
|
// Copyright (c) 2021-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
|
|
|
|
#ifdef GFLAGS
|
|
|
|
|
|
|
|
#include "db_stress_tool/expected_value.h"
|
|
|
|
|
|
|
|
#include <atomic>
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
Fix false-positive TestBackupRestore corruption (#12917)
Summary:
**Context:**
https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread.
The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario:
(1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already.
(2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed.
```
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
```
(3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed.
```
for (int64_t key = key_base;
s.ok() && key < shared->GetMaxKey() &&
static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
++key)
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
}
```
(4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early.
```
bool Exists() const { return PendingPut() || !IsDeleted(); }
TestBackupRestore() {
...
Status get_status = restored_db->Get(
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in restore but not in original db";
s = Status::Corruption(oss.str());
}
} else if (get_status.IsNotFound()) {
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in original db but not in restore";
s = Status::Corruption(oss.str());
}
}
...
}
```
So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore`
A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread.
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below.
```
void StressTest::OperateDb(ThreadState* thread) {
for (uint64_t i = 0; i < ops_per_open; i++) {
...
int64_t rand_key = GenerateOneKey(thread, i);
...
}
}
```
**Summary:**
- Removed the "lock release" part and related checks
- Printed recovery time if the write thread waited more than 10 seconds
- Reverted regression in testing coverage when deleting a non-existent key
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917
Test Plan:
Below command repro-ed frequently before the fix and not after.
```
./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35
```
Reviewed By: cbi42
Differential Revision: D60890580
Pulled By: hx235
fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
|
|
|
void ExpectedValue::Put(bool pending) {
|
2023-05-18 16:44:15 +00:00
|
|
|
if (pending) {
|
|
|
|
SetPendingWrite();
|
|
|
|
} else {
|
|
|
|
SetValueBase(NextValueBase());
|
|
|
|
ClearDeleted();
|
|
|
|
ClearPendingWrite();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
bool ExpectedValue::Delete(bool pending) {
|
2024-08-15 19:32:59 +00:00
|
|
|
if (pending && !Exists()) {
|
2023-05-18 16:44:15 +00:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (pending) {
|
|
|
|
SetPendingDel();
|
|
|
|
} else {
|
|
|
|
SetDelCounter(NextDelCounter());
|
|
|
|
SetDeleted();
|
|
|
|
ClearPendingDel();
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
void ExpectedValue::SyncPut(uint32_t value_base) {
|
|
|
|
assert(ExpectedValue::IsValueBaseValid(value_base));
|
|
|
|
|
|
|
|
SetValueBase(value_base);
|
|
|
|
ClearDeleted();
|
|
|
|
ClearPendingWrite();
|
|
|
|
|
|
|
|
// This is needed in case crash happens during a pending delete of the key
|
|
|
|
// assocated with this expected value
|
|
|
|
ClearPendingDel();
|
|
|
|
}
|
|
|
|
|
|
|
|
void ExpectedValue::SyncPendingPut() { Put(true /* pending */); }
|
|
|
|
|
|
|
|
void ExpectedValue::SyncDelete() {
|
|
|
|
Delete(false /* pending */);
|
|
|
|
// This is needed in case crash happens during a pending write of the key
|
|
|
|
// assocated with this expected value
|
|
|
|
ClearPendingWrite();
|
|
|
|
}
|
|
|
|
|
|
|
|
uint32_t ExpectedValue::GetFinalValueBase() const {
|
|
|
|
return PendingWrite() ? NextValueBase() : GetValueBase();
|
|
|
|
}
|
|
|
|
|
|
|
|
uint32_t ExpectedValue::GetFinalDelCounter() const {
|
|
|
|
return PendingDelete() ? NextDelCounter() : GetDelCounter();
|
|
|
|
}
|
|
|
|
|
|
|
|
bool ExpectedValueHelper::MustHaveNotExisted(
|
|
|
|
ExpectedValue pre_read_expected_value,
|
|
|
|
ExpectedValue post_read_expected_value) {
|
|
|
|
const bool pre_read_expected_deleted = pre_read_expected_value.IsDeleted();
|
|
|
|
|
|
|
|
const uint32_t pre_read_expected_value_base =
|
|
|
|
pre_read_expected_value.GetValueBase();
|
|
|
|
|
|
|
|
const uint32_t post_read_expected_final_value_base =
|
|
|
|
post_read_expected_value.GetFinalValueBase();
|
|
|
|
|
|
|
|
const bool during_read_no_write_happened =
|
|
|
|
(pre_read_expected_value_base == post_read_expected_final_value_base);
|
|
|
|
return pre_read_expected_deleted && during_read_no_write_happened;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool ExpectedValueHelper::MustHaveExisted(
|
|
|
|
ExpectedValue pre_read_expected_value,
|
|
|
|
ExpectedValue post_read_expected_value) {
|
|
|
|
const bool pre_read_expected_not_deleted =
|
|
|
|
!pre_read_expected_value.IsDeleted();
|
|
|
|
|
|
|
|
const uint32_t pre_read_expected_del_counter =
|
|
|
|
pre_read_expected_value.GetDelCounter();
|
|
|
|
const uint32_t post_read_expected_final_del_counter =
|
|
|
|
post_read_expected_value.GetFinalDelCounter();
|
|
|
|
|
|
|
|
const bool during_read_no_delete_happened =
|
|
|
|
(pre_read_expected_del_counter == post_read_expected_final_del_counter);
|
|
|
|
|
|
|
|
return pre_read_expected_not_deleted && during_read_no_delete_happened;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool ExpectedValueHelper::InExpectedValueBaseRange(
|
|
|
|
uint32_t value_base, ExpectedValue pre_read_expected_value,
|
|
|
|
ExpectedValue post_read_expected_value) {
|
|
|
|
assert(ExpectedValue::IsValueBaseValid(value_base));
|
|
|
|
|
|
|
|
const uint32_t pre_read_expected_value_base =
|
|
|
|
pre_read_expected_value.GetValueBase();
|
|
|
|
const uint32_t post_read_expected_final_value_base =
|
|
|
|
post_read_expected_value.GetFinalValueBase();
|
|
|
|
|
|
|
|
if (pre_read_expected_value_base <= post_read_expected_final_value_base) {
|
|
|
|
const uint32_t lower_bound = pre_read_expected_value_base;
|
|
|
|
const uint32_t upper_bound = post_read_expected_final_value_base;
|
|
|
|
return lower_bound <= value_base && value_base <= upper_bound;
|
|
|
|
} else {
|
|
|
|
const uint32_t upper_bound_1 = post_read_expected_final_value_base;
|
|
|
|
const uint32_t lower_bound_2 = pre_read_expected_value_base;
|
|
|
|
const uint32_t upper_bound_2 = ExpectedValue::GetValueBaseMask();
|
|
|
|
return (value_base <= upper_bound_1) ||
|
|
|
|
(lower_bound_2 <= value_base && value_base <= upper_bound_2);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
|
|
#endif // GFLAGS
|