db_stress fault injection in release mode (#9957)

Summary:
Previously all fault injection was ignored in release mode. This PR adds it back except for read fault injection (`--read_fault_one_in > 0`) since its dependency (`IGNORE_STATUS_IF_ERROR`) is unavailable in release mode.

Other notable changes include:

- Moved `EnableWriteErrorInjection()` for `--write_fault_one_in > 0` so it's after `DB::Open()` without depending on `SyncPoint`
- Made `--read_fault_one_in > 0` return an error in release mode
- Updated `db_crashtest.py` to always set `--read_fault_one_in=0` in release mode

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9957

Test Plan:
```
$ DEBUG_LEVEL=0 make -j24 db_stress
$ DEBUG_LEVEL=0 TEST_TMPDIR=/dev/shm python3 tools/db_crashtest.py blackbox
```

Reviewed By: anand1976

Differential Revision: D36193830

Pulled By: ajkr

fbshipit-source-id: 0b97946b4e3f06e3e0f6e7833c2763da08ec5321
This commit is contained in:
Andrew Kryczka 2022-05-06 11:17:08 -07:00 committed by Facebook GitHub Bot
parent b7aaa98762
commit 62d84e2a2b
9 changed files with 18 additions and 47 deletions

View File

@ -18,11 +18,9 @@
ROCKSDB_NAMESPACE::Env* db_stress_listener_env = nullptr;
ROCKSDB_NAMESPACE::Env* db_stress_env = nullptr;
#ifndef NDEBUG
// If non-null, injects read error at a rate specified by the
// read_fault_one_in or write_fault_one_in flag
std::shared_ptr<ROCKSDB_NAMESPACE::FaultInjectionTestFS> fault_fs_guard;
#endif // NDEBUG
enum ROCKSDB_NAMESPACE::CompressionType compression_type_e =
ROCKSDB_NAMESPACE::kSnappyCompression;
enum ROCKSDB_NAMESPACE::CompressionType bottommost_compression_type_e =

View File

@ -67,6 +67,7 @@
#include "util/random.h"
#include "util/string_util.h"
#include "utilities/blob_db/blob_db.h"
#include "utilities/fault_injection_fs.h"
#include "utilities/merge_operators.h"
using GFLAGS_NAMESPACE::ParseCommandLineFlags;
@ -296,12 +297,7 @@ constexpr int kValueMaxLen = 100;
// wrapped posix environment
extern ROCKSDB_NAMESPACE::Env* db_stress_env;
extern ROCKSDB_NAMESPACE::Env* db_stress_listener_env;
#ifndef NDEBUG
namespace ROCKSDB_NAMESPACE {
class FaultInjectionTestFS;
} // namespace ROCKSDB_NAMESPACE
extern std::shared_ptr<ROCKSDB_NAMESPACE::FaultInjectionTestFS> fault_fs_guard;
#endif
extern enum ROCKSDB_NAMESPACE::CompressionType compression_type_e;
extern enum ROCKSDB_NAMESPACE::CompressionType bottommost_compression_type_e;

View File

@ -62,11 +62,12 @@ bool RunStressTest(StressTest* stress) {
stress->InitDb(&shared);
stress->FinishInitDb(&shared);
#ifndef NDEBUG
if (FLAGS_sync_fault_injection) {
fault_fs_guard->SetFilesystemDirectWritable(false);
}
#endif
if (FLAGS_write_fault_one_in) {
fault_fs_guard->EnableWriteErrorInjection();
}
uint32_t n = FLAGS_threads;
uint64_t now = clock->NowMicros();

View File

@ -133,13 +133,21 @@ class SharedState {
for (int i = 0; i < FLAGS_column_families; ++i) {
key_locks_[i].reset(new port::Mutex[num_locks]);
}
#ifndef NDEBUG
if (FLAGS_read_fault_one_in) {
#ifdef NDEBUG
// Unsupported in release mode because it relies on
// `IGNORE_STATUS_IF_ERROR` to distinguish faults not expected to lead to
// failure.
fprintf(stderr,
"Cannot set nonzero value for --read_fault_one_in in "
"release mode.");
exit(1);
#else // NDEBUG
SyncPoint::GetInstance()->SetCallBack("FaultInjectionIgnoreError",
IgnoreReadErrorCallback);
SyncPoint::GetInstance()->EnableProcessing();
#endif // NDEBUG
}
#endif // NDEBUG
}
~SharedState() {

View File

@ -674,6 +674,7 @@ void StressTest::OperateDb(ThreadState* thread) {
fault_fs_guard->SetThreadLocalReadErrorContext(thread->shared->GetSeed(),
FLAGS_read_fault_one_in);
}
#endif // NDEBUG
if (FLAGS_write_fault_one_in) {
IOStatus error_msg;
if (FLAGS_injest_error_severity <= 1 || FLAGS_injest_error_severity > 2) {
@ -691,7 +692,6 @@ void StressTest::OperateDb(ThreadState* thread) {
thread->shared->GetSeed(), FLAGS_write_fault_one_in, error_msg,
/*inject_for_all_file_types=*/false, types);
}
#endif // NDEBUG
thread->stats.Start();
for (int open_cnt = 0; open_cnt <= FLAGS_reopen; ++open_cnt) {
if (thread->shared->HasVerificationFailedYet() ||
@ -2621,7 +2621,6 @@ void StressTest::Open(SharedState* shared) {
RegisterAdditionalListeners();
options_.create_missing_column_families = true;
if (!FLAGS_use_txn) {
#ifndef NDEBUG
// Determine whether we need to ingest file metadata write failures
// during DB reopen. If it does, enable it.
// Only ingest metadata error if it is reopening, as initial open
@ -2663,7 +2662,6 @@ void StressTest::Open(SharedState* shared) {
}
}
while (true) {
#endif // NDEBUG
#ifndef ROCKSDB_LITE
// StackableDB-based BlobDB
if (FLAGS_use_blob_db) {
@ -2693,7 +2691,6 @@ void StressTest::Open(SharedState* shared) {
}
}
#ifndef NDEBUG
if (ingest_meta_error || ingest_write_error || ingest_read_error) {
fault_fs_guard->SetFilesystemDirectWritable(true);
fault_fs_guard->DisableMetadataWriteErrorInjection();
@ -2705,7 +2702,7 @@ void StressTest::Open(SharedState* shared) {
// wait for all compactions to finish to make sure DB is in
// clean state before executing queries.
s = static_cast_with_check<DBImpl>(db_->GetRootDB())
->TEST_WaitForCompact(true);
->WaitForCompact(true /* wait_unscheduled */);
if (!s.ok()) {
for (auto cf : column_families_) {
delete cf;
@ -2738,7 +2735,6 @@ void StressTest::Open(SharedState* shared) {
}
break;
}
#endif // NDEBUG
} else {
#ifndef ROCKSDB_LITE
TransactionDBOptions txn_db_options;

View File

@ -24,9 +24,7 @@
#include "db_stress_tool/db_stress_common.h"
#include "db_stress_tool/db_stress_driver.h"
#include "rocksdb/convenience.h"
#ifndef NDEBUG
#include "utilities/fault_injection_fs.h"
#endif
namespace ROCKSDB_NAMESPACE {
namespace {
@ -82,7 +80,6 @@ int db_stress_tool(int argc, char** argv) {
dbsl_env_wrapper_guard = std::make_shared<DbStressEnvWrapper>(raw_env);
db_stress_listener_env = dbsl_env_wrapper_guard.get();
#ifndef NDEBUG
if (FLAGS_read_fault_one_in || FLAGS_sync_fault_injection ||
FLAGS_write_fault_one_in || FLAGS_open_metadata_write_fault_one_in ||
FLAGS_open_write_fault_one_in || FLAGS_open_read_fault_one_in) {
@ -98,18 +95,10 @@ int db_stress_tool(int argc, char** argv) {
std::make_shared<CompositeEnvWrapper>(raw_env, fault_fs_guard);
raw_env = fault_env_guard.get();
}
if (FLAGS_write_fault_one_in) {
SyncPoint::GetInstance()->SetCallBack(
"BuildTable:BeforeFinishBuildTable",
[&](void*) { fault_fs_guard->EnableWriteErrorInjection(); });
SyncPoint::GetInstance()->EnableProcessing();
}
#endif
env_wrapper_guard = std::make_shared<DbStressEnvWrapper>(raw_env);
db_stress_env = env_wrapper_guard.get();
#ifndef NDEBUG
if (FLAGS_write_fault_one_in) {
// In the write injection case, we need to use the FS interface and returns
// the IOStatus with different error and flags. Therefore,
@ -118,7 +107,6 @@ int db_stress_tool(int argc, char** argv) {
// CompositeEnvWrapper of env and fault_fs.
db_stress_env = raw_env;
}
#endif
FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str());

View File

@ -12,9 +12,7 @@
#include "rocksdb/utilities/write_batch_with_index.h"
#include "util/defer.h"
#ifndef NDEBUG
#include "utilities/fault_injection_fs.h"
#endif // NDEBUG
#include "utilities/transactions/write_prepared_txn_db.h"
namespace ROCKSDB_NAMESPACE {

View File

@ -9,9 +9,7 @@
#ifdef GFLAGS
#include "db_stress_tool/db_stress_common.h"
#ifndef NDEBUG
#include "utilities/fault_injection_fs.h"
#endif // NDEBUG
#include "rocksdb/utilities/transaction_db.h"
namespace ROCKSDB_NAMESPACE {
@ -234,20 +232,15 @@ class NonBatchedOpsStressTest : public StressTest {
std::string from_db;
int error_count = 0;
#ifndef NDEBUG
if (fault_fs_guard) {
fault_fs_guard->EnableErrorInjection();
SharedState::ignore_read_error = false;
}
#endif // NDEBUG
Status s = db_->Get(read_opts, cfh, key, &from_db);
#ifndef NDEBUG
if (fault_fs_guard) {
error_count = fault_fs_guard->GetAndResetErrorCount();
}
#endif // NDEBUG
if (s.ok()) {
#ifndef NDEBUG
if (fault_fs_guard) {
if (error_count && !SharedState::ignore_read_error) {
// Grab mutex so multiple thread don't try to print the
@ -259,7 +252,6 @@ class NonBatchedOpsStressTest : public StressTest {
std::terminate();
}
}
#endif // NDEBUG
// found case
thread->stats.AddGets(1, 1);
} else if (s.IsNotFound()) {
@ -273,11 +265,9 @@ class NonBatchedOpsStressTest : public StressTest {
thread->stats.AddVerifiedErrors(1);
}
}
#ifndef NDEBUG
if (fault_fs_guard) {
fault_fs_guard->DisableErrorInjection();
}
#endif // NDEBUG
return s;
}
@ -365,19 +355,15 @@ class NonBatchedOpsStressTest : public StressTest {
}
if (!use_txn) {
#ifndef NDEBUG
if (fault_fs_guard) {
fault_fs_guard->EnableErrorInjection();
SharedState::ignore_read_error = false;
}
#endif // NDEBUG
db_->MultiGet(readoptionscopy, cfh, num_keys, keys.data(), values.data(),
statuses.data());
#ifndef NDEBUG
if (fault_fs_guard) {
error_count = fault_fs_guard->GetAndResetErrorCount();
}
#endif // NDEBUG
} else {
#ifndef ROCKSDB_LITE
txn->MultiGet(readoptionscopy, cfh, num_keys, keys.data(), values.data(),
@ -385,7 +371,6 @@ class NonBatchedOpsStressTest : public StressTest {
#endif
}
#ifndef NDEBUG
if (fault_fs_guard && error_count && !SharedState::ignore_read_error) {
int stat_nok = 0;
for (const auto& s : statuses) {
@ -409,7 +394,6 @@ class NonBatchedOpsStressTest : public StressTest {
if (fault_fs_guard) {
fault_fs_guard->DisableErrorInjection();
}
#endif // NDEBUG
for (size_t i = 0; i < statuses.size(); ++i) {
Status s = statuses[i];

View File

@ -415,6 +415,8 @@ multiops_wp_txn_params = {
def finalize_and_sanitize(src_params):
dest_params = dict([(k, v() if callable(v) else v)
for (k, v) in src_params.items()])
if is_release_mode():
dest_params['read_fault_one_in'] = 0
if dest_params.get("compression_max_dict_bytes") == 0:
dest_params["compression_zstd_max_train_bytes"] = 0
dest_params["compression_max_dict_buffer_bytes"] = 0