From 20dbf512473b570c62b83258c058eff448779601 Mon Sep 17 00:00:00 2001 From: Jay Huh Date: Mon, 18 Sep 2023 23:04:38 -0700 Subject: [PATCH] DB Stress Fix - Commit pending value after checking for status (#11856) Summary: We've seen occasional crash test failure in optimistic transaction DB with the following error message. ``` stderr: WARNING: prefix_size is non-zero but memtablerep != prefix_hash Verification failed for column family 0 key 0000000000001EDB0000000000000036787878787878 (789064): value_from_db: 010000000504070609080B0A0D0C0F0E111013121514171619181B1A1D1C1F1E212023222524272629282B2A2D2C2F2E313033323534373639383B3A3D3C3F3E, value_from_expected: , msg: MultiGet verification: Unexpected value found Crash-recovery verification failed :( No writes or ops? Verification failed :( ``` There was a possibility if the db stress crashes after `pending_expected_value.Commit()` but before `thread->shared->SafeTerminate();`, we may have expected value committed while actual DB value was not. Moving the `pending_expected_value.Commit()` after `s.ok()` check to fix the test. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11856 Test Plan: Ran the following in a script with while loop. (It doesn't always repro the issue even without this fix, though..) ``` ./db_stress --acquire_snapshot_one_in=10000 --adaptive_readahead=0 --allow_data_in_errors=True --async_io=1 --atomic_flush=1 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_unnecessary_blocking_io=1 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --block_protection_bytes_per_key=0 --block_size=16384 --bloom_before_level=8 --bloom_bits=11 --bottommost_compression_type=lz4 --bottommost_file_compaction_delay=0 --bytes_per_sync=0 --cache_index_and_filter_blocks=0 --cache_size=8388608 --cache_type=auto_hyper_clock_cache --charge_compression_dictionary_building_buffer=0 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=0 --checkpoint_one_in=1000000 --checksum_type=kxxHash --clear_column_family_one_in=0 --compact_files_one_in=1000000 --compact_range_one_in=1000000 --compaction_pri=4 --compaction_readahead_size=0 --compaction_ttl=100 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=lz4 --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_blackbox --db_write_buffer_size=1048576 --delpercent=5 --delrangepercent=0 --destroy_db_initially=0 --detect_filter_construct_corruption=0 --disable_wal=1 --enable_compaction_filter=0 --enable_pipelined_write=0 --enable_thread_tracking=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=1 --fifo_allow_compaction=1 --file_checksum_impl=crc32c --flush_one_in=1000000 --format_version=5 --get_current_wal_file_one_in=0 --get_live_files_one_in=1000000 --get_property_one_in=1000000 --get_sorted_wal_files_one_in=0 --index_block_restart_interval=3 --index_type=2 --ingest_external_file_one_in=0 --initial_auto_readahead_size=524288 --iterpercent=10 --key_len_percent_dist=1,30,69 --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=1000000 --long_running_snapshots=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=524288 --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=25000000 --max_key_len=3 --max_manifest_file_size=1073741824 --max_write_batch_group_size_bytes=16777216 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=2097152 --memtable_max_range_deletions=0 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=2 --memtable_whole_key_filtering=0 --memtablerep=skip_list --min_write_buffer_number_to_merge=1 --mmap_read=1 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=1 --occ_lock_bucket_count=500 --occ_validation_policy=0 --open_files=-1 --open_metadata_write_fault_one_in=8 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=100000000 --optimize_filters_for_memory=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=1000000 --periodic_compaction_seconds=2 --prefix_size=5 --prefixpercent=5 --prepopulate_block_cache=1 --preserve_internal_time_seconds=36000 --progress_reports=0 --read_fault_one_in=0 --readahead_size=0 --readpercent=45 --recycle_log_file_num=1 --reopen=0 --secondary_cache_fault_one_in=0 --secondary_cache_uri=compressed_secondary_cache://capacity=8388608;enable_custom_split_merge=true --set_options_one_in=10000 --share_occ_lock_buckets=0 --snapshot_hold_ops=100000 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=0 --subcompactions=3 --sync=0 --sync_fault_injection=0 --target_file_size_base=2097152 --target_file_size_multiplier=2 --test_batches_snapshots=0 --top_level_index_pinning=3 --unpartitioned_pinning=2 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_get_entity=0 --use_multiget=1 --use_optimistic_txn=1 --use_put_entity_one_in=0 --use_txn=1 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_db_one_in=100000 --verify_file_checksums_one_in=1000000 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=zstd --write_buffer_size=4194304 --write_dbid_to_manifest=1 --write_fault_one_in=0 --writepercent=35& ``` ``` pid=$! sleep 10 kill -9 $pid sleep 1 ``` ``` db_stress --acquire_snapshot_one_in=10000 --adaptive_readahead=1 --allow_data_in_errors=True --async_io=0 --atomic_flush=1 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_unnecessary_blocking_io=1 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=8 --block_protection_bytes_per_key=0 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=75.01353068032098 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=3600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=0 --cache_size=8388608 --cache_type=auto_hyper_clock_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=1 --charge_table_reader=0 --checkpoint_one_in=1000000 --checksum_type=kCRC32c --clear_column_family_one_in=0 --compact_files_one_in=1000000 --compact_range_one_in=1000000 --compaction_pri=4 --compaction_readahead_size=0 --compaction_ttl=0 --compression_checksum=0 --compression_max_dict_buffer_bytes=0 --compression_max_dict_bytes=0 --compression_parallel_threads=1 --compression_type=xpress --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --data_block_index_type=1 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_blackbox --db_write_buffer_size=134217728 --delpercent=5 --delrangepercent=0 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_wal=1 --enable_compaction_filter=0 --enable_pipelined_write=0 --enable_thread_tracking=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=1 --fifo_allow_compaction=0 --file_checksum_impl=none --flush_one_in=1000000 --format_version=5 --get_current_wal_file_one_in=0 --get_live_files_one_in=1000000 --get_property_one_in=1000000 --get_sorted_wal_files_one_in=0 --index_block_restart_interval=10 --index_type=0 --ingest_external_file_one_in=0 --initial_auto_readahead_size=524288 --iterpercent=10 --key_len_percent_dist=1,30,69 --level_compaction_dynamic_level_bytes=1 --lock_wal_one_in=1000000 --long_running_snapshots=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=524288 --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=25000000 --max_key_len=3 --max_manifest_file_size=1073741824 --max_write_batch_group_size_bytes=1048576 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=1048576 --memtable_max_range_deletions=100 --memtable_prefix_bloom_size_ratio=0.01 --memtable_protection_bytes_per_key=8 --memtable_whole_key_filtering=0 --memtablerep=skip_list --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=True --nooverwritepercent=1 --num_file_reads_for_auto_readahead=2 --occ_lock_bucket_count=10 --occ_validation_policy=0 --open_files=-1 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=32 --open_write_fault_one_in=16 --ops_per_thread=100000000 --optimize_filters_for_memory=0 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=0 --pause_background_one_in=1000000 --periodic_compaction_seconds=0 --prefix_size=1 --prefixpercent=5 --prepopulate_block_cache=0 --preserve_internal_time_seconds=36000 --progress_reports=0 --read_fault_one_in=32 --readahead_size=0 --readpercent=45 --recycle_log_file_num=1 --reopen=0 --secondary_cache_fault_one_in=0 --secondary_cache_uri=compressed_secondary_cache://capacity=8388608;enable_custom_split_merge=true --set_options_one_in=10000 --share_occ_lock_buckets=0 --snapshot_hold_ops=100000 --sst_file_manager_bytes_per_sec=104857600 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=0 --subcompactions=4 --sync=0 --sync_fault_injection=0 --target_file_size_base=2097152 --target_file_size_multiplier=2 --test_batches_snapshots=0 --top_level_index_pinning=1 --unpartitioned_pinning=1 --use_direct_io_for_flush_and_compaction=1 --use_direct_reads=0 --use_full_merge_v1=1 --use_get_entity=0 --use_merge=1 --use_multi_get_entity=0 --use_multiget=1 --use_optimistic_txn=1 --use_put_entity_one_in=0 --use_txn=1 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_db_one_in=100000 --verify_file_checksums_one_in=0 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=524288 --wal_compression=zstd --write_buffer_size=4194304 --write_dbid_to_manifest=1 --write_fault_one_in=0 --writepercent=35 ``` Reviewed By: cbi42 Differential Revision: D49403091 Pulled By: jaykorean fbshipit-source-id: 5ee6136133bbdc46aa733e5101c1f998f658c200 --- db_stress_tool/no_batched_ops_stress.cc | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/db_stress_tool/no_batched_ops_stress.cc b/db_stress_tool/no_batched_ops_stress.cc index 321406a2fc..92d0c9fca9 100644 --- a/db_stress_tool/no_batched_ops_stress.cc +++ b/db_stress_tool/no_batched_ops_stress.cc @@ -1309,8 +1309,6 @@ class NonBatchedOpsStressTest : public StressTest { } } - pending_expected_value.Commit(); - if (!s.ok()) { if (FLAGS_inject_error_severity >= 2) { if (!is_db_stopped_ && s.severity() >= Status::Severity::kFatalError) { @@ -1325,7 +1323,7 @@ class NonBatchedOpsStressTest : public StressTest { thread->shared->SafeTerminate(); } } - + pending_expected_value.Commit(); thread->stats.AddBytesForWrites(1, sz); PrintKeyValue(rand_column_family, static_cast(rand_key), value, sz); @@ -1367,9 +1365,7 @@ class NonBatchedOpsStressTest : public StressTest { return txn.Delete(cfh, key); }); } - pending_expected_value.Commit(); - thread->stats.AddDeletes(1); if (!s.ok()) { if (FLAGS_inject_error_severity >= 2) { if (!is_db_stopped_ && @@ -1385,6 +1381,8 @@ class NonBatchedOpsStressTest : public StressTest { thread->shared->SafeTerminate(); } } + pending_expected_value.Commit(); + thread->stats.AddDeletes(1); } else { PendingExpectedValue pending_expected_value = shared->PrepareSingleDelete(rand_column_family, rand_key); @@ -1399,8 +1397,7 @@ class NonBatchedOpsStressTest : public StressTest { return txn.SingleDelete(cfh, key); }); } - pending_expected_value.Commit(); - thread->stats.AddSingleDeletes(1); + if (!s.ok()) { if (FLAGS_inject_error_severity >= 2) { if (!is_db_stopped_ && @@ -1416,6 +1413,8 @@ class NonBatchedOpsStressTest : public StressTest { thread->shared->SafeTerminate(); } } + pending_expected_value.Commit(); + thread->stats.AddSingleDeletes(1); } return s; }