From e90e9153d5169f9df8ddd3cf0bf693572768a7b5 Mon Sep 17 00:00:00 2001 From: Hui Xiao Date: Sun, 23 Jun 2024 21:54:27 -0700 Subject: [PATCH] Calculate `injected_error_count` even when `SharedState::ignore_read_error` (#12800) Summary: **Context/Summary:** `injected_error_count` is needed to verify read error injection. For example, when injected_error_count == 0, the read call should not return error. https://github.com/facebook/rocksdb/commit/981fd432fa2441fc10a59a462bd14906ccb1c0e0 only calculated `injected_error_count` under `SharedState::ignore_read_error=false` so `injected_error_count==0` when `SharedState::ignore_read_error=true`. However we can still inject read error in critical read path under `SharedState::ignore_read_error=true` so the read call is expected to return injected error. This contradicts to the `injected_error_count == 0` as we skipped its calculation. As a consequence, we see ``` TestPrefixScan error: IO error: injected read error; Verification failed ``` in code paths ``` if (s.ok()) { thread->stats.AddPrefixes(1, count); } else if (injected_error_count > 0 && IsRetryableInjectedError(s)) { fprintf(stdout, "TestPrefixScan error: %s\n", s.ToString().c_str()); } else { fprintf(stderr, "TestPrefixScan error: %s\n", s.ToString().c_str()); thread->shared->SetVerificationFailure(); } ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/12800 Test Plan: CI Reviewed By: cbi42 Differential Revision: D58918014 Pulled By: hx235 fbshipit-source-id: d73139c114fb3f61003dedca116f7ec36309eca4 --- db_stress_tool/no_batched_ops_stress.cc | 41 +++++++++++++++---------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/db_stress_tool/no_batched_ops_stress.cc b/db_stress_tool/no_batched_ops_stress.cc index 6bb454abbc..e95352a5e4 100644 --- a/db_stress_tool/no_batched_ops_stress.cc +++ b/db_stress_tool/no_batched_ops_stress.cc @@ -538,13 +538,14 @@ class NonBatchedOpsStressTest : public StressTest { thread->shared->Get(rand_column_families[0], rand_keys[0]); int injected_error_count = 0; - if (fault_fs_guard && !SharedState::ignore_read_error) { + if (fault_fs_guard) { injected_error_count = GetMinInjectedErrorCount( fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount( FaultInjectionIOType::kRead), fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount( FaultInjectionIOType::kMetadataRead)); - if (injected_error_count > 0 && (s.ok() || s.IsNotFound())) { + if (!SharedState::ignore_read_error && injected_error_count > 0 && + (s.ok() || s.IsNotFound())) { // Grab mutex so multiple thread don't try to print the // stack trace at the same time MutexLock l(thread->shared->GetMutex()); @@ -692,7 +693,7 @@ class NonBatchedOpsStressTest : public StressTest { } db_->MultiGet(readoptionscopy, cfh, num_keys, keys.data(), values.data(), statuses.data()); - if (fault_fs_guard && !SharedState::ignore_read_error) { + if (fault_fs_guard) { injected_error_count = GetMinInjectedErrorCount( fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount( FaultInjectionIOType::kRead), @@ -706,7 +707,8 @@ class NonBatchedOpsStressTest : public StressTest { stat_nok_nfound++; } } - if (stat_nok_nfound < injected_error_count) { + if (!SharedState::ignore_read_error && + stat_nok_nfound < injected_error_count) { // Grab mutex so multiple thread don't try to print the // stack trace at the same time MutexLock l(shared->GetMutex()); @@ -1001,13 +1003,14 @@ class NonBatchedOpsStressTest : public StressTest { thread->shared->Get(column_family, key); int injected_error_count = 0; - if (fault_fs_guard && !SharedState::ignore_read_error) { + if (fault_fs_guard) { injected_error_count = GetMinInjectedErrorCount( fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount( FaultInjectionIOType::kRead), fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount( FaultInjectionIOType::kMetadataRead)); - if (injected_error_count > 0 && (s.ok() || s.IsNotFound())) { + if (!SharedState::ignore_read_error && injected_error_count > 0 && + (s.ok() || s.IsNotFound())) { // Grab mutex so multiple thread don't try to print the // stack trace at the same time MutexLock l(thread->shared->GetMutex()); @@ -1168,7 +1171,8 @@ class NonBatchedOpsStressTest : public StressTest { } } - if (stat_nok_nfound < injected_error_count) { + if (!SharedState::ignore_read_error && + stat_nok_nfound < injected_error_count) { // Grab mutex so multiple threads don't try to print the // stack trace at the same time assert(thread->shared); @@ -1401,7 +1405,7 @@ class NonBatchedOpsStressTest : public StressTest { db_->MultiGetEntity(read_opts_copy, num_keys, key_slices.data(), results.data()); - if (fault_fs_guard && !SharedState::ignore_read_error) { + if (fault_fs_guard) { verify_expected_errors( [&](size_t i) { return results[i][0].status(); }); } @@ -1431,7 +1435,7 @@ class NonBatchedOpsStressTest : public StressTest { db_->MultiGetEntity(read_opts_copy, cfh, num_keys, key_slices.data(), results.data(), statuses.data()); - if (fault_fs_guard && !SharedState::ignore_read_error) { + if (fault_fs_guard) { verify_expected_errors([&](size_t i) { return statuses[i]; }); } @@ -1478,11 +1482,6 @@ class NonBatchedOpsStressTest : public StressTest { MaybeUseOlderTimestampForRangeScan(thread, read_ts_str, read_ts_slice, ro_copy); - std::unique_ptr iter(db_->NewIterator(ro_copy, cfh)); - - uint64_t count = 0; - Status s; - if (fault_fs_guard) { fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount( FaultInjectionIOType::kRead); @@ -1491,6 +1490,11 @@ class NonBatchedOpsStressTest : public StressTest { SharedState::ignore_read_error = false; } + std::unique_ptr iter(db_->NewIterator(ro_copy, cfh)); + + uint64_t count = 0; + Status s; + for (iter->Seek(prefix); iter->Valid() && iter->key().starts_with(prefix); iter->Next()) { ++count; @@ -1522,13 +1526,14 @@ class NonBatchedOpsStressTest : public StressTest { } int injected_error_count = 0; - if (fault_fs_guard && !SharedState::ignore_read_error) { + if (fault_fs_guard) { injected_error_count = GetMinInjectedErrorCount( fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount( FaultInjectionIOType::kRead), fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount( FaultInjectionIOType::kMetadataRead)); - if (injected_error_count > 0 && s.ok()) { + if (!SharedState::ignore_read_error && injected_error_count > 0 && + s.ok()) { // Grab mutex so multiple thread don't try to print the // stack trace at the same time MutexLock l(thread->shared->GetMutex()); @@ -1547,6 +1552,10 @@ class NonBatchedOpsStressTest : public StressTest { } else if (injected_error_count > 0 && IsRetryableInjectedError(s)) { fprintf(stdout, "TestPrefixScan error: %s\n", s.ToString().c_str()); } else { + fprintf(stderr, "TestPrefixScan is retryable error? %s\n", + IsRetryableInjectedError(s) ? "true" : "false"); + fprintf(stderr, "TestPrefixScan injected_error_count: %d\n", + injected_error_count); fprintf(stderr, "TestPrefixScan error: %s\n", s.ToString().c_str()); thread->shared->SetVerificationFailure(); }