mirror of https://github.com/facebook/rocksdb.git
Fix crash_test_with_best_efforts_recovery (#11938)
Summary: Thanks ltamasi and ajkr for initial investigations on the test failure. Per the investigations, the following scenario is likely causing the test to fail. 1. Recovery is needed (could be any reason during crash test) 2. Trying to recover from the latest manifest fails (likely due to read error injection) 3. DB opens with recovery from the next manifest which is different from step 2. 4. Expected state is based on the manifest we tried and failed in step 2. 5. Two manifests used in step 2 and 3 are confirmed to have difference in LSM trees (Thanks ltamasi again for the finding). ``` 2023/10/05-11:24:18.942189 56341 [db/version_set.cc:6079] Trying to recover from manifest: /dev/shm/rocksdb_test/rocksdb_crashtest_blackbox/MANIFEST-007184 ... 2023/10/05-11:24:18.978007 56341 [db/version_set.cc:6079] Trying to recover from manifest: /dev/shm/rocksdb_test/rocksdb_crashtest_blackbox/MANIFEST-007180 ``` ``` [ltamasi@devbig1024.prn1 /tmp/x]$ ldb manifest_dump --hex --path=MANIFEST-007184_renamed_ > 2 [ltamasi@devbig1024.prn1 /tmp/x]$ ldb manifest_dump --hex --path=MANIFEST-007180_renamed_ > 1 [ltamasi@devbig1024.prn1 /tmp/x]$ diff 1 2 --- 1 2023-10-09 10:29:16.966215207 -0700 +++ 2 2023-10-09 10:29:11.984241645 -0700 @@ -13,7 +13,7 @@ 7174:3950254[1875617 .. 2203952]['000000000003415B000000000000012B000000000000007D' seq:1906214, type:1 .. '000000000003CA59000000000000012B000000000000005C' seq:2039838, type:1] 7175:88060[2074748 .. 2203892]['000000000003CA6300000000000000CF78787878787878' seq:2167539, type:2 .. '000000000003D08F000000000000012B0000000000000130' seq:2112478, type:0] --- level 6 --- version# 1 --- - 7057:3132633[0 .. 2046144]['0000000000000009000000000000000978' seq:0, type:1 .. '0000000000005F8B000000000000012B00000000000002AC' seq:0, type:1] + 7219:2135565[0 .. 2046144]['0000000000000009000000000000000978' seq:0, type:1 .. '0000000000005F8B000000000000012B00000000000002AC' seq:0, type:1] 7061:827724[0 .. 2046131]['0000000000005F95000000000000000778787878787878' seq:0, type:1 .. '000000000000784F000000000000012B0000000000000113' seq:0, type:1] 6763:1352[0 .. 0]['000000000000784F000000000000012B0000000000000129' seq:0, type:1 .. '000000000000784F000000000000012B0000000000000129' seq:0, type:1] 7173:4812291[0 .. 2203957]['000000000000784F000000000000012B0000000000000138' seq:0, type:1 .. '0000000000020FAE787878787878' seq:0, type:1] @@ -77,4 +77,4 @@ --- level 61 --- version# 1 --- --- level 62 --- version# 1 --- --- level 63 --- version# 1 --- -next_file_number 7182 last_sequence 2203963 prev_log_number 0 max_column_family 0 min_log_number_to_keep 7015 +next_file_number 7221 last_sequence 2203963 prev_log_number 0 max_column_family 0 min_log_number_to_keep 7015 ``` We have two options to fix this. Either skip verification against expected state or disable read injection when BE recovery is enabled. I chose to skip verification against expected state per discussion. (See comments in this PR) Please note that some linter changes were included in this PR. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11938 Test Plan: ``` TEST_TMPDIR=/dev/shm/rocksdb make crash_test_with_best_efforts_recovery ``` Reviewed By: ltamasi Differential Revision: D50136341 Pulled By: jaykorean fbshipit-source-id: ac7434d592aebc148bfc3a4fcaa34936f136b95c
This commit is contained in:
parent
d367b34cc9
commit
d2daa10afc
|
@ -240,10 +240,10 @@ int db_stress_tool(int argc, char** argv) {
|
|||
FLAGS_secondaries_base = default_secondaries_path;
|
||||
}
|
||||
|
||||
if (FLAGS_best_efforts_recovery && !FLAGS_skip_verifydb &&
|
||||
!FLAGS_disable_wal) {
|
||||
if (FLAGS_best_efforts_recovery &&
|
||||
!(FLAGS_skip_verifydb && FLAGS_disable_wal)) {
|
||||
fprintf(stderr,
|
||||
"With best-efforts recovery, either skip_verifydb or disable_wal "
|
||||
"With best-efforts recovery, skip_verifydb and disable_wal "
|
||||
"should be set to true.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
|
|
@ -160,12 +160,12 @@ default_params = {
|
|||
"sync": lambda: random.choice([1 if t == 0 else 0 for t in range(0, 20)]),
|
||||
"bytes_per_sync": lambda: random.choice([0, 262144]),
|
||||
"wal_bytes_per_sync": lambda: random.choice([0, 524288]),
|
||||
"compaction_readahead_size" : lambda : random.choice(
|
||||
"compaction_readahead_size": lambda: random.choice(
|
||||
[0, 0, 1024 * 1024]),
|
||||
"db_write_buffer_size": lambda: random.choice(
|
||||
[0, 0, 0, 1024 * 1024, 8 * 1024 * 1024, 128 * 1024 * 1024]
|
||||
),
|
||||
"use_write_buffer_manager": lambda: random.randint(0,1),
|
||||
"use_write_buffer_manager": lambda: random.randint(0, 1),
|
||||
"avoid_unnecessary_blocking_io": random.randint(0, 1),
|
||||
"write_dbid_to_manifest": random.randint(0, 1),
|
||||
"avoid_flush_during_recovery": lambda: random.choice(
|
||||
|
@ -221,7 +221,9 @@ default_params = {
|
|||
"preserve_internal_time_seconds": lambda: random.choice([0, 60, 3600, 36000]),
|
||||
"memtable_max_range_deletions": lambda: random.choice([0] * 6 + [100, 1000]),
|
||||
# 0 (disable) is the default and more commonly used value.
|
||||
"bottommost_file_compaction_delay": lambda: random.choice([0, 0, 0, 600, 3600, 86400]),
|
||||
"bottommost_file_compaction_delay": lambda: random.choice(
|
||||
[0, 0, 0, 600, 3600, 86400]
|
||||
),
|
||||
"auto_readahead_size" : lambda: random.choice([0, 1]),
|
||||
}
|
||||
|
||||
|
@ -423,6 +425,8 @@ best_efforts_recovery_params = {
|
|||
"atomic_flush": 0,
|
||||
"disable_wal": 1,
|
||||
"column_families": 1,
|
||||
"skip_verifydb": 1,
|
||||
"verify_db_one_in": 0
|
||||
}
|
||||
|
||||
blob_params = {
|
||||
|
@ -664,6 +668,8 @@ def finalize_and_sanitize(src_params):
|
|||
dest_params["enable_compaction_filter"] = 0
|
||||
dest_params["sync"] = 0
|
||||
dest_params["write_fault_one_in"] = 0
|
||||
dest_params["skip_verifydb"] = 1
|
||||
dest_params["verify_db_one_in"] = 0
|
||||
# Remove the following once write-prepared/write-unprepared with/without
|
||||
# unordered write supports timestamped snapshots
|
||||
if dest_params.get("create_timestamped_snapshot_one_in", 0) > 0:
|
||||
|
@ -774,7 +780,7 @@ def gen_cmd(params, unknown_params):
|
|||
"stress_cmd",
|
||||
"test_tiered_storage",
|
||||
"cleanup_cmd",
|
||||
"skip_tmpdir_check"
|
||||
"skip_tmpdir_check",
|
||||
}
|
||||
and v is not None
|
||||
]
|
||||
|
|
Loading…
Reference in New Issue