2021-09-28 21:12:23 +00:00
|
|
|
// Copyright (c) 2021-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
|
|
|
|
#ifdef GFLAGS
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
|
|
|
|
#include <stdint.h>
|
|
|
|
|
|
|
|
#include <atomic>
|
|
|
|
#include <memory>
|
|
|
|
|
2021-12-07 21:40:46 +00:00
|
|
|
#include "db/dbformat.h"
|
2023-05-18 16:44:15 +00:00
|
|
|
#include "db_stress_tool/expected_value.h"
|
2021-12-07 21:40:46 +00:00
|
|
|
#include "file/file_util.h"
|
|
|
|
#include "rocksdb/db.h"
|
2021-09-28 21:12:23 +00:00
|
|
|
#include "rocksdb/env.h"
|
2021-12-07 21:40:46 +00:00
|
|
|
#include "rocksdb/file_system.h"
|
2021-09-28 21:12:23 +00:00
|
|
|
#include "rocksdb/rocksdb_namespace.h"
|
2021-12-07 21:40:46 +00:00
|
|
|
#include "rocksdb/types.h"
|
|
|
|
#include "util/string_util.h"
|
2021-09-28 21:12:23 +00:00
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2023-05-18 16:44:15 +00:00
|
|
|
// `ExpectedState` provides read/write access to expected values stored in
|
|
|
|
// `ExpectedState` for every key.
|
2021-09-28 21:12:23 +00:00
|
|
|
class ExpectedState {
|
|
|
|
public:
|
|
|
|
explicit ExpectedState(size_t max_key, size_t num_column_families);
|
|
|
|
|
|
|
|
virtual ~ExpectedState() {}
|
|
|
|
|
|
|
|
// Requires external locking preventing concurrent execution with any other
|
|
|
|
// member function.
|
|
|
|
virtual Status Open(bool create) = 0;
|
|
|
|
|
|
|
|
// Requires external locking covering all keys in `cf`.
|
|
|
|
void ClearColumnFamily(int cf);
|
|
|
|
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
// Prepare a Put that will be started but not finished yet
|
|
|
|
// This is useful for crash-recovery testing when the process may crash
|
|
|
|
// before updating the corresponding expected value
|
2021-09-28 21:12:23 +00:00
|
|
|
//
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
// Requires external locking covering `key` in `cf` to prevent concurrent
|
|
|
|
// write or delete to the same `key`.
|
Fix false-positive TestBackupRestore corruption (#12917)
Summary:
**Context:**
https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread.
The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario:
(1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already.
(2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed.
```
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
```
(3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed.
```
for (int64_t key = key_base;
s.ok() && key < shared->GetMaxKey() &&
static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
++key)
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
}
```
(4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early.
```
bool Exists() const { return PendingPut() || !IsDeleted(); }
TestBackupRestore() {
...
Status get_status = restored_db->Get(
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in restore but not in original db";
s = Status::Corruption(oss.str());
}
} else if (get_status.IsNotFound()) {
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in original db but not in restore";
s = Status::Corruption(oss.str());
}
}
...
}
```
So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore`
A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread.
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below.
```
void StressTest::OperateDb(ThreadState* thread) {
for (uint64_t i = 0; i < ops_per_open; i++) {
...
int64_t rand_key = GenerateOneKey(thread, i);
...
}
}
```
**Summary:**
- Removed the "lock release" part and related checks
- Printed recovery time if the write thread waited more than 10 seconds
- Reverted regression in testing coverage when deleting a non-existent key
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917
Test Plan:
Below command repro-ed frequently before the fix and not after.
```
./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35
```
Reviewed By: cbi42
Differential Revision: D60890580
Pulled By: hx235
fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
|
|
|
PendingExpectedValue PreparePut(int cf, int64_t key);
|
2021-09-28 21:12:23 +00:00
|
|
|
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
// Does not requires external locking.
|
|
|
|
ExpectedValue Get(int cf, int64_t key);
|
2021-09-28 21:12:23 +00:00
|
|
|
|
2024-07-29 20:51:49 +00:00
|
|
|
// Prepare a Delete that will be started but not finished yet.
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
// This is useful for crash-recovery testing when the process may crash
|
|
|
|
// before updating the corresponding expected value
|
2021-09-28 21:12:23 +00:00
|
|
|
//
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
// Requires external locking covering `key` in `cf` to prevent concurrent
|
|
|
|
// write or delete to the same `key`.
|
Fix false-positive TestBackupRestore corruption (#12917)
Summary:
**Context:**
https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread.
The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario:
(1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already.
(2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed.
```
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
```
(3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed.
```
for (int64_t key = key_base;
s.ok() && key < shared->GetMaxKey() &&
static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
++key)
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
}
```
(4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early.
```
bool Exists() const { return PendingPut() || !IsDeleted(); }
TestBackupRestore() {
...
Status get_status = restored_db->Get(
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in restore but not in original db";
s = Status::Corruption(oss.str());
}
} else if (get_status.IsNotFound()) {
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in original db but not in restore";
s = Status::Corruption(oss.str());
}
}
...
}
```
So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore`
A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread.
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below.
```
void StressTest::OperateDb(ThreadState* thread) {
for (uint64_t i = 0; i < ops_per_open; i++) {
...
int64_t rand_key = GenerateOneKey(thread, i);
...
}
}
```
**Summary:**
- Removed the "lock release" part and related checks
- Printed recovery time if the write thread waited more than 10 seconds
- Reverted regression in testing coverage when deleting a non-existent key
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917
Test Plan:
Below command repro-ed frequently before the fix and not after.
```
./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35
```
Reviewed By: cbi42
Differential Revision: D60890580
Pulled By: hx235
fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
|
|
|
PendingExpectedValue PrepareDelete(int cf, int64_t key);
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
|
|
|
|
// Requires external locking covering `key` in `cf` to prevent concurrent
|
|
|
|
// write or delete to the same `key`.
|
Fix false-positive TestBackupRestore corruption (#12917)
Summary:
**Context:**
https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread.
The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario:
(1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already.
(2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed.
```
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
```
(3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed.
```
for (int64_t key = key_base;
s.ok() && key < shared->GetMaxKey() &&
static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
++key)
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
}
```
(4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early.
```
bool Exists() const { return PendingPut() || !IsDeleted(); }
TestBackupRestore() {
...
Status get_status = restored_db->Get(
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in restore but not in original db";
s = Status::Corruption(oss.str());
}
} else if (get_status.IsNotFound()) {
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in original db but not in restore";
s = Status::Corruption(oss.str());
}
}
...
}
```
So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore`
A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread.
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below.
```
void StressTest::OperateDb(ThreadState* thread) {
for (uint64_t i = 0; i < ops_per_open; i++) {
...
int64_t rand_key = GenerateOneKey(thread, i);
...
}
}
```
**Summary:**
- Removed the "lock release" part and related checks
- Printed recovery time if the write thread waited more than 10 seconds
- Reverted regression in testing coverage when deleting a non-existent key
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917
Test Plan:
Below command repro-ed frequently before the fix and not after.
```
./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35
```
Reviewed By: cbi42
Differential Revision: D60890580
Pulled By: hx235
fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
|
|
|
PendingExpectedValue PrepareSingleDelete(int cf, int64_t key);
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
|
|
|
|
// Requires external locking covering keys in `[begin_key, end_key)` in `cf`
|
|
|
|
// to prevent concurrent write or delete to the same `key`.
|
|
|
|
std::vector<PendingExpectedValue> PrepareDeleteRange(int cf,
|
|
|
|
int64_t begin_key,
|
Fix false-positive TestBackupRestore corruption (#12917)
Summary:
**Context:**
https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread.
The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario:
(1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already.
(2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed.
```
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
```
(3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed.
```
for (int64_t key = key_base;
s.ok() && key < shared->GetMaxKey() &&
static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
++key)
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
}
```
(4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early.
```
bool Exists() const { return PendingPut() || !IsDeleted(); }
TestBackupRestore() {
...
Status get_status = restored_db->Get(
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in restore but not in original db";
s = Status::Corruption(oss.str());
}
} else if (get_status.IsNotFound()) {
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in original db but not in restore";
s = Status::Corruption(oss.str());
}
}
...
}
```
So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore`
A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread.
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below.
```
void StressTest::OperateDb(ThreadState* thread) {
for (uint64_t i = 0; i < ops_per_open; i++) {
...
int64_t rand_key = GenerateOneKey(thread, i);
...
}
}
```
**Summary:**
- Removed the "lock release" part and related checks
- Printed recovery time if the write thread waited more than 10 seconds
- Reverted regression in testing coverage when deleting a non-existent key
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917
Test Plan:
Below command repro-ed frequently before the fix and not after.
```
./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35
```
Reviewed By: cbi42
Differential Revision: D60890580
Pulled By: hx235
fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
|
|
|
int64_t end_key);
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
|
|
|
|
// Update the expected value for start of an incomplete write or delete
|
|
|
|
// operation on the key assoicated with this expected value
|
|
|
|
void Precommit(int cf, int64_t key, const ExpectedValue& value);
|
|
|
|
|
|
|
|
// Requires external locking covering `key` in `cf` to prevent concurrent
|
|
|
|
// delete to the same `key`.
|
|
|
|
bool Exists(int cf, int64_t key);
|
2021-09-28 21:12:23 +00:00
|
|
|
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
// Sync the `value_base` to the corresponding expected value
|
2021-09-28 21:12:23 +00:00
|
|
|
//
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
// Requires external locking covering `key` in `cf` or be in single thread
|
|
|
|
// to prevent concurrent write or delete to the same `key`
|
|
|
|
void SyncPut(int cf, int64_t key, uint32_t value_base);
|
2021-09-28 21:12:23 +00:00
|
|
|
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
// Sync the corresponding expected value to be pending Put
|
2021-09-28 21:12:23 +00:00
|
|
|
//
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
// Requires external locking covering `key` in `cf` or be in single thread
|
|
|
|
// to prevent concurrent write or delete to the same `key`
|
|
|
|
void SyncPendingPut(int cf, int64_t key);
|
2021-09-28 21:12:23 +00:00
|
|
|
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
// Sync the corresponding expected value to be deleted
|
|
|
|
//
|
|
|
|
// Requires external locking covering `key` in `cf` or be in single thread
|
|
|
|
// to prevent concurrent write or delete to the same `key`
|
|
|
|
void SyncDelete(int cf, int64_t key);
|
|
|
|
|
|
|
|
// Sync the corresponding expected values to be deleted
|
|
|
|
//
|
|
|
|
// Requires external locking covering keys in `[begin_key, end_key)` in `cf`
|
|
|
|
// to prevent concurrent write or delete to the same `key`
|
|
|
|
void SyncDeleteRange(int cf, int64_t begin_key, int64_t end_key);
|
2021-09-28 21:12:23 +00:00
|
|
|
|
|
|
|
private:
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
// Does not requires external locking.
|
2021-09-28 21:12:23 +00:00
|
|
|
std::atomic<uint32_t>& Value(int cf, int64_t key) const {
|
|
|
|
return values_[cf * max_key_ + key];
|
|
|
|
}
|
|
|
|
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
// Does not requires external locking
|
|
|
|
ExpectedValue Load(int cf, int64_t key) const {
|
|
|
|
return ExpectedValue(Value(cf, key).load());
|
|
|
|
}
|
|
|
|
|
2021-09-28 21:12:23 +00:00
|
|
|
const size_t max_key_;
|
|
|
|
const size_t num_column_families_;
|
|
|
|
|
|
|
|
protected:
|
|
|
|
size_t GetValuesLen() const {
|
|
|
|
return sizeof(std::atomic<uint32_t>) * num_column_families_ * max_key_;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Requires external locking preventing concurrent execution with any other
|
|
|
|
// member function.
|
|
|
|
void Reset();
|
|
|
|
|
|
|
|
std::atomic<uint32_t>* values_;
|
|
|
|
};
|
|
|
|
|
|
|
|
// A `FileExpectedState` implements `ExpectedState` backed by a file.
|
|
|
|
class FileExpectedState : public ExpectedState {
|
|
|
|
public:
|
|
|
|
explicit FileExpectedState(std::string expected_state_file_path,
|
|
|
|
size_t max_key, size_t num_column_families);
|
|
|
|
|
|
|
|
// Requires external locking preventing concurrent execution with any other
|
|
|
|
// member function.
|
|
|
|
Status Open(bool create) override;
|
|
|
|
|
|
|
|
private:
|
|
|
|
const std::string expected_state_file_path_;
|
|
|
|
std::unique_ptr<MemoryMappedFileBuffer> expected_state_mmap_buffer_;
|
|
|
|
};
|
|
|
|
|
|
|
|
// An `AnonExpectedState` implements `ExpectedState` backed by a memory
|
|
|
|
// allocation.
|
|
|
|
class AnonExpectedState : public ExpectedState {
|
|
|
|
public:
|
|
|
|
explicit AnonExpectedState(size_t max_key, size_t num_column_families);
|
|
|
|
|
|
|
|
// Requires external locking preventing concurrent execution with any other
|
|
|
|
// member function.
|
|
|
|
Status Open(bool create) override;
|
|
|
|
|
|
|
|
private:
|
|
|
|
std::unique_ptr<std::atomic<uint32_t>[]> values_allocation_;
|
|
|
|
};
|
|
|
|
|
|
|
|
// An `ExpectedStateManager` manages data about the expected state of the
|
|
|
|
// database. It exposes operations for reading and modifying the latest
|
|
|
|
// expected state.
|
|
|
|
class ExpectedStateManager {
|
|
|
|
public:
|
|
|
|
explicit ExpectedStateManager(size_t max_key, size_t num_column_families);
|
|
|
|
|
|
|
|
virtual ~ExpectedStateManager();
|
|
|
|
|
|
|
|
// Requires external locking preventing concurrent execution with any other
|
|
|
|
// member function.
|
|
|
|
virtual Status Open() = 0;
|
|
|
|
|
2021-12-07 21:40:46 +00:00
|
|
|
// Saves expected values for the current state of `db` and begins tracking
|
2021-12-15 20:53:32 +00:00
|
|
|
// changes. Following a successful `SaveAtAndAfter()`, `Restore()` can be
|
|
|
|
// called on the same DB, as long as its state does not roll back to before
|
|
|
|
// its current state.
|
2021-12-07 21:40:46 +00:00
|
|
|
//
|
|
|
|
// Requires external locking preventing concurrent execution with any other
|
|
|
|
// member function. Furthermore, `db` must not be mutated while this function
|
|
|
|
// is executing.
|
|
|
|
virtual Status SaveAtAndAfter(DB* db) = 0;
|
|
|
|
|
2021-12-15 20:53:32 +00:00
|
|
|
// Returns true if at least one state of historical expected values can be
|
|
|
|
// restored.
|
|
|
|
//
|
|
|
|
// Requires external locking preventing concurrent execution with any other
|
|
|
|
// member function.
|
|
|
|
virtual bool HasHistory() = 0;
|
|
|
|
|
|
|
|
// Restores expected values according to the current state of `db`. See
|
|
|
|
// `SaveAtAndAfter()` for conditions where this can be called.
|
|
|
|
//
|
|
|
|
// Requires external locking preventing concurrent execution with any other
|
|
|
|
// member function. Furthermore, `db` must not be mutated while this function
|
|
|
|
// is executing.
|
|
|
|
virtual Status Restore(DB* db) = 0;
|
|
|
|
|
2021-09-28 21:12:23 +00:00
|
|
|
// Requires external locking covering all keys in `cf`.
|
|
|
|
void ClearColumnFamily(int cf) { return latest_->ClearColumnFamily(cf); }
|
|
|
|
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
// See ExpectedState::PreparePut()
|
Fix false-positive TestBackupRestore corruption (#12917)
Summary:
**Context:**
https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread.
The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario:
(1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already.
(2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed.
```
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
```
(3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed.
```
for (int64_t key = key_base;
s.ok() && key < shared->GetMaxKey() &&
static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
++key)
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
}
```
(4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early.
```
bool Exists() const { return PendingPut() || !IsDeleted(); }
TestBackupRestore() {
...
Status get_status = restored_db->Get(
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in restore but not in original db";
s = Status::Corruption(oss.str());
}
} else if (get_status.IsNotFound()) {
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in original db but not in restore";
s = Status::Corruption(oss.str());
}
}
...
}
```
So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore`
A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread.
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below.
```
void StressTest::OperateDb(ThreadState* thread) {
for (uint64_t i = 0; i < ops_per_open; i++) {
...
int64_t rand_key = GenerateOneKey(thread, i);
...
}
}
```
**Summary:**
- Removed the "lock release" part and related checks
- Printed recovery time if the write thread waited more than 10 seconds
- Reverted regression in testing coverage when deleting a non-existent key
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917
Test Plan:
Below command repro-ed frequently before the fix and not after.
```
./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35
```
Reviewed By: cbi42
Differential Revision: D60890580
Pulled By: hx235
fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
|
|
|
PendingExpectedValue PreparePut(int cf, int64_t key) {
|
|
|
|
return latest_->PreparePut(cf, key);
|
2021-09-28 21:12:23 +00:00
|
|
|
}
|
|
|
|
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
// See ExpectedState::Get()
|
|
|
|
ExpectedValue Get(int cf, int64_t key) { return latest_->Get(cf, key); }
|
2021-09-28 21:12:23 +00:00
|
|
|
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
// See ExpectedState::PrepareDelete()
|
Fix false-positive TestBackupRestore corruption (#12917)
Summary:
**Context:**
https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread.
The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario:
(1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already.
(2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed.
```
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
```
(3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed.
```
for (int64_t key = key_base;
s.ok() && key < shared->GetMaxKey() &&
static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
++key)
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
}
```
(4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early.
```
bool Exists() const { return PendingPut() || !IsDeleted(); }
TestBackupRestore() {
...
Status get_status = restored_db->Get(
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in restore but not in original db";
s = Status::Corruption(oss.str());
}
} else if (get_status.IsNotFound()) {
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in original db but not in restore";
s = Status::Corruption(oss.str());
}
}
...
}
```
So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore`
A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread.
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below.
```
void StressTest::OperateDb(ThreadState* thread) {
for (uint64_t i = 0; i < ops_per_open; i++) {
...
int64_t rand_key = GenerateOneKey(thread, i);
...
}
}
```
**Summary:**
- Removed the "lock release" part and related checks
- Printed recovery time if the write thread waited more than 10 seconds
- Reverted regression in testing coverage when deleting a non-existent key
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917
Test Plan:
Below command repro-ed frequently before the fix and not after.
```
./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35
```
Reviewed By: cbi42
Differential Revision: D60890580
Pulled By: hx235
fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
|
|
|
PendingExpectedValue PrepareDelete(int cf, int64_t key) {
|
|
|
|
return latest_->PrepareDelete(cf, key);
|
2021-09-28 21:12:23 +00:00
|
|
|
}
|
|
|
|
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
// See ExpectedState::PrepareSingleDelete()
|
Fix false-positive TestBackupRestore corruption (#12917)
Summary:
**Context:**
https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread.
The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario:
(1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already.
(2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed.
```
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
```
(3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed.
```
for (int64_t key = key_base;
s.ok() && key < shared->GetMaxKey() &&
static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
++key)
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
}
```
(4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early.
```
bool Exists() const { return PendingPut() || !IsDeleted(); }
TestBackupRestore() {
...
Status get_status = restored_db->Get(
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in restore but not in original db";
s = Status::Corruption(oss.str());
}
} else if (get_status.IsNotFound()) {
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in original db but not in restore";
s = Status::Corruption(oss.str());
}
}
...
}
```
So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore`
A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread.
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below.
```
void StressTest::OperateDb(ThreadState* thread) {
for (uint64_t i = 0; i < ops_per_open; i++) {
...
int64_t rand_key = GenerateOneKey(thread, i);
...
}
}
```
**Summary:**
- Removed the "lock release" part and related checks
- Printed recovery time if the write thread waited more than 10 seconds
- Reverted regression in testing coverage when deleting a non-existent key
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917
Test Plan:
Below command repro-ed frequently before the fix and not after.
```
./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35
```
Reviewed By: cbi42
Differential Revision: D60890580
Pulled By: hx235
fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
|
|
|
PendingExpectedValue PrepareSingleDelete(int cf, int64_t key) {
|
|
|
|
return latest_->PrepareSingleDelete(cf, key);
|
2021-09-28 21:12:23 +00:00
|
|
|
}
|
|
|
|
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
// See ExpectedState::PrepareDeleteRange()
|
|
|
|
std::vector<PendingExpectedValue> PrepareDeleteRange(int cf,
|
|
|
|
int64_t begin_key,
|
Fix false-positive TestBackupRestore corruption (#12917)
Summary:
**Context:**
https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread.
The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario:
(1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already.
(2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed.
```
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
```
(3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed.
```
for (int64_t key = key_base;
s.ok() && key < shared->GetMaxKey() &&
static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
++key)
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
}
```
(4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early.
```
bool Exists() const { return PendingPut() || !IsDeleted(); }
TestBackupRestore() {
...
Status get_status = restored_db->Get(
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in restore but not in original db";
s = Status::Corruption(oss.str());
}
} else if (get_status.IsNotFound()) {
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in original db but not in restore";
s = Status::Corruption(oss.str());
}
}
...
}
```
So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore`
A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread.
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below.
```
void StressTest::OperateDb(ThreadState* thread) {
for (uint64_t i = 0; i < ops_per_open; i++) {
...
int64_t rand_key = GenerateOneKey(thread, i);
...
}
}
```
**Summary:**
- Removed the "lock release" part and related checks
- Printed recovery time if the write thread waited more than 10 seconds
- Reverted regression in testing coverage when deleting a non-existent key
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917
Test Plan:
Below command repro-ed frequently before the fix and not after.
```
./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35
```
Reviewed By: cbi42
Differential Revision: D60890580
Pulled By: hx235
fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
|
|
|
int64_t end_key) {
|
|
|
|
return latest_->PrepareDeleteRange(cf, begin_key, end_key);
|
2021-09-28 21:12:23 +00:00
|
|
|
}
|
|
|
|
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
// See ExpectedState::Exists()
|
2021-09-28 21:12:23 +00:00
|
|
|
bool Exists(int cf, int64_t key) { return latest_->Exists(cf, key); }
|
|
|
|
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
// See ExpectedState::SyncPut()
|
|
|
|
void SyncPut(int cf, int64_t key, uint32_t value_base) {
|
|
|
|
return latest_->SyncPut(cf, key, value_base);
|
|
|
|
}
|
|
|
|
|
|
|
|
// See ExpectedState::SyncPendingPut()
|
|
|
|
void SyncPendingPut(int cf, int64_t key) {
|
|
|
|
return latest_->SyncPendingPut(cf, key);
|
|
|
|
}
|
|
|
|
|
|
|
|
// See ExpectedState::SyncDelete()
|
|
|
|
void SyncDelete(int cf, int64_t key) { return latest_->SyncDelete(cf, key); }
|
|
|
|
|
|
|
|
// See ExpectedState::SyncDeleteRange()
|
|
|
|
void SyncDeleteRange(int cf, int64_t begin_key, int64_t end_key) {
|
|
|
|
return latest_->SyncDeleteRange(cf, begin_key, end_key);
|
|
|
|
}
|
|
|
|
|
2021-09-28 21:12:23 +00:00
|
|
|
protected:
|
|
|
|
const size_t max_key_;
|
|
|
|
const size_t num_column_families_;
|
|
|
|
std::unique_ptr<ExpectedState> latest_;
|
|
|
|
};
|
|
|
|
|
|
|
|
// A `FileExpectedStateManager` implements an `ExpectedStateManager` backed by
|
|
|
|
// a directory of files containing data about the expected state of the
|
|
|
|
// database.
|
|
|
|
class FileExpectedStateManager : public ExpectedStateManager {
|
|
|
|
public:
|
|
|
|
explicit FileExpectedStateManager(size_t max_key, size_t num_column_families,
|
|
|
|
std::string expected_state_dir_path);
|
|
|
|
|
|
|
|
// Requires external locking preventing concurrent execution with any other
|
|
|
|
// member function.
|
|
|
|
Status Open() override;
|
|
|
|
|
2021-12-07 21:40:46 +00:00
|
|
|
// See `ExpectedStateManager::SaveAtAndAfter()` API doc.
|
|
|
|
//
|
|
|
|
// This implementation makes a copy of "LATEST.state" into
|
|
|
|
// "<current seqno>.state", and starts a trace in "<current seqno>.trace".
|
2021-12-15 20:53:32 +00:00
|
|
|
// Due to using external files, a following `Restore()` can happen even
|
|
|
|
// from a different process.
|
2021-12-07 21:40:46 +00:00
|
|
|
Status SaveAtAndAfter(DB* db) override;
|
|
|
|
|
2021-12-15 20:53:32 +00:00
|
|
|
// See `ExpectedStateManager::HasHistory()` API doc.
|
|
|
|
bool HasHistory() override;
|
|
|
|
|
|
|
|
// See `ExpectedStateManager::Restore()` API doc.
|
|
|
|
//
|
|
|
|
// Say `db->GetLatestSequenceNumber()` was `a` last time `SaveAtAndAfter()`
|
|
|
|
// was called and now it is `b`. Then this function replays `b - a` write
|
|
|
|
// operations from "`a`.trace" onto "`a`.state", and then copies the resulting
|
|
|
|
// file into "LATEST.state".
|
|
|
|
Status Restore(DB* db) override;
|
|
|
|
|
2021-09-28 21:12:23 +00:00
|
|
|
private:
|
|
|
|
// Requires external locking preventing concurrent execution with any other
|
|
|
|
// member function.
|
|
|
|
Status Clean();
|
|
|
|
|
|
|
|
std::string GetTempPathForFilename(const std::string& filename);
|
|
|
|
std::string GetPathForFilename(const std::string& filename);
|
|
|
|
|
2021-12-07 21:40:46 +00:00
|
|
|
static const std::string kLatestBasename;
|
|
|
|
static const std::string kStateFilenameSuffix;
|
|
|
|
static const std::string kTraceFilenameSuffix;
|
|
|
|
static const std::string kTempFilenamePrefix;
|
|
|
|
static const std::string kTempFilenameSuffix;
|
2021-09-28 21:12:23 +00:00
|
|
|
|
|
|
|
const std::string expected_state_dir_path_;
|
2021-12-07 21:40:46 +00:00
|
|
|
SequenceNumber saved_seqno_ = kMaxSequenceNumber;
|
2021-09-28 21:12:23 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
// An `AnonExpectedStateManager` implements an `ExpectedStateManager` backed by
|
|
|
|
// a memory allocation containing data about the expected state of the database.
|
|
|
|
class AnonExpectedStateManager : public ExpectedStateManager {
|
|
|
|
public:
|
|
|
|
explicit AnonExpectedStateManager(size_t max_key, size_t num_column_families);
|
|
|
|
|
2021-12-07 21:40:46 +00:00
|
|
|
// See `ExpectedStateManager::SaveAtAndAfter()` API doc.
|
|
|
|
//
|
|
|
|
// This implementation returns `Status::NotSupported` since we do not
|
|
|
|
// currently have a need to keep history of expected state within a process.
|
|
|
|
Status SaveAtAndAfter(DB* /* db */) override {
|
|
|
|
return Status::NotSupported();
|
|
|
|
}
|
|
|
|
|
2021-12-15 20:53:32 +00:00
|
|
|
// See `ExpectedStateManager::HasHistory()` API doc.
|
|
|
|
bool HasHistory() override { return false; }
|
|
|
|
|
|
|
|
// See `ExpectedStateManager::Restore()` API doc.
|
|
|
|
//
|
|
|
|
// This implementation returns `Status::NotSupported` since we do not
|
|
|
|
// currently have a need to keep history of expected state within a process.
|
|
|
|
Status Restore(DB* /* db */) override { return Status::NotSupported(); }
|
|
|
|
|
2021-09-28 21:12:23 +00:00
|
|
|
// Requires external locking preventing concurrent execution with any other
|
|
|
|
// member function.
|
|
|
|
Status Open() override;
|
|
|
|
};
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
|
|
|
|
|
|
#endif // GFLAGS
|