2016-02-09 23:12:00 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 23:03:42 +00:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2015-01-15 18:28:10 +00:00
|
|
|
//
|
|
|
|
// Copyright 2014 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
// This test uses a custom Env to keep track of the state of a filesystem as of
|
|
|
|
// the last "sync". It then checks for data loss errors by purposely dropping
|
|
|
|
// file data (or entire files) not protected by a "sync".
|
|
|
|
|
2019-05-31 22:21:36 +00:00
|
|
|
#include "db/db_impl/db_impl.h"
|
2015-01-15 18:28:10 +00:00
|
|
|
#include "db/log_format.h"
|
|
|
|
#include "db/version_set.h"
|
2017-04-06 02:02:00 +00:00
|
|
|
#include "env/mock_env.h"
|
2019-05-30 03:44:08 +00:00
|
|
|
#include "file/filename.h"
|
2015-01-15 18:28:10 +00:00
|
|
|
#include "rocksdb/cache.h"
|
2021-11-08 19:04:01 +00:00
|
|
|
#include "rocksdb/convenience.h"
|
2015-01-15 18:28:10 +00:00
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/env.h"
|
|
|
|
#include "rocksdb/table.h"
|
|
|
|
#include "rocksdb/write_batch.h"
|
2019-05-30 18:21:38 +00:00
|
|
|
#include "test_util/sync_point.h"
|
|
|
|
#include "test_util/testharness.h"
|
|
|
|
#include "test_util/testutil.h"
|
2019-05-31 00:39:43 +00:00
|
|
|
#include "util/mutexlock.h"
|
2020-07-09 21:33:42 +00:00
|
|
|
#include "util/random.h"
|
|
|
|
#include "utilities/fault_injection_env.h"
|
Fix a bug causing duplicate trailing entries in WritableFile (buffered IO) (#9236)
Summary:
`db_stress` is a user of `FaultInjectionTestFS`. After injecting a write error, `db_stress` probabilistically determines
data drop (https://github.com/facebook/rocksdb/blob/6.27.fb/db_stress_tool/db_stress_test_base.cc#L2615:L2619).
In some of our recent runs of `db_stress`, we found duplicate trailing entries corresponding to file trivial move in
the MANIFEST, causing the recovery to fail, because the file move operation is not idempotent: you cannot delete a
file from a given level twice.
Investigation suggests that data buffering in both `WritableFileWriter` and `FaultInjectionTestFS` may be the root cause.
WritableFileWriter buffers data to write in a memory buffer, `WritableFileWriter::buf_`. After each
`WriteBuffered()`/`WriteBufferedWithChecksum()` succeeds, the `buf_` is cleared.
If the underlying file `WritableFileWriter::writable_file_` is opened in buffered IO mode, then `FaultInjectionTestFS`
buffers data written for each file until next file sync. After an injected error, user of `FaultInjectionFS` can
choose to drop some or none of previously buffered data. If `db_stress` does not drop any unsynced data, then
such data will still exist in the `FaultInjectionTestFS`'s buffer.
The existing implementation of `WritableFileWriter::WriteBuffered()` does not clear `buf_` if there is an error. This may lead
to the data being buffered in two copies: one in `WritableFileWriter`, and another in `FaultInjectionTestFS`.
We also know that the `WritableFileWriter` of MANIFEST file will close upon an error. During `Close()`, it will flush the
content in `buf_`. If no write error is injected to `FaultInjectionTestFS` this time, then we end up with two copies of the
data appended to the file.
To fix, we clear the `WritableFileWriter::buf_` upon failure as well. We focus this PR on files opened in non-direct mode.
This PR includes a unit test to reproduce a case when write error injection
to `WritableFile` can cause duplicate trailing entries.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9236
Test Plan: make check
Reviewed By: zhichao-cao
Differential Revision: D33033984
Pulled By: riversand963
fbshipit-source-id: ebfa5a0db8cbf1ed73100528b34fcba543c5db31
2021-12-13 16:59:20 +00:00
|
|
|
#ifndef NDEBUG
|
|
|
|
#include "utilities/fault_injection_fs.h"
|
|
|
|
#endif
|
2015-01-15 18:28:10 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2015-01-15 18:28:10 +00:00
|
|
|
|
|
|
|
// Length, in bytes, of each value generated for a key.
static constexpr int kValueSize = 1000;
// Exclusive upper bound used when picking how many values a round writes.
static constexpr int kMaxNumValues = 2000;
// Number of rounds run for each option configuration.
static constexpr size_t kNumIterations = 3;
|
|
|
|
|
2018-05-07 19:15:54 +00:00
|
|
|
// Option configurations a fault-injection test can step through. The values
// are consecutive so that a test can walk a [begin, end) range by
// incrementing an int.
enum FaultInjectionOptionConfig {
  // Default-constructed Options.
  kDefault = 0,
  // Adds a separate entry to Options::db_paths.
  kDifferentDataDir = 1,
  // Puts the WAL in a separate directory.
  kWalDir = 2,
  // Persists data via synced WAL writes instead of CompactRange().
  kSyncWal = 3,
  // Separate WAL directory combined with synced WAL writes.
  kWalDirSyncWal = 4,
  // Small buffers/files and low level-0 triggers, with synced WAL writes.
  kMultiLevels = 5,
  // One past the last real configuration.
  kEnd = 6,
};
|
|
|
|
class FaultInjectionTest
|
|
|
|
: public testing::Test,
|
|
|
|
public testing::WithParamInterface<std::tuple<
|
|
|
|
bool, FaultInjectionOptionConfig, FaultInjectionOptionConfig>> {
|
2015-01-23 02:34:23 +00:00
|
|
|
protected:
|
|
|
|
int option_config_;
|
2018-05-07 19:15:54 +00:00
|
|
|
int non_inclusive_end_range_; // kEnd or equivalent to that
|
2015-01-23 02:34:23 +00:00
|
|
|
// When need to make sure data is persistent, sync WAL
|
2015-01-24 00:03:24 +00:00
|
|
|
bool sync_use_wal_;
|
2015-01-23 02:34:23 +00:00
|
|
|
// When need to make sure data is persistent, call DB::CompactRange()
|
2015-01-24 00:03:24 +00:00
|
|
|
bool sync_use_compact_;
|
2015-01-23 02:34:23 +00:00
|
|
|
|
2015-07-16 19:18:32 +00:00
|
|
|
bool sequential_order_;
|
|
|
|
|
2015-01-15 18:28:10 +00:00
|
|
|
public:
|
2015-01-23 02:34:23 +00:00
|
|
|
enum ExpectedVerifResult { kValExpectFound, kValExpectNoError };
|
|
|
|
enum ResetMethod {
|
|
|
|
kResetDropUnsyncedData,
|
2015-01-26 23:22:18 +00:00
|
|
|
kResetDropRandomUnsyncedData,
|
2015-01-23 02:34:23 +00:00
|
|
|
kResetDeleteUnsyncedFiles,
|
|
|
|
kResetDropAndDeleteUnsynced
|
|
|
|
};
|
2015-01-15 18:28:10 +00:00
|
|
|
|
2015-01-27 22:44:19 +00:00
|
|
|
std::unique_ptr<Env> base_env_;
|
2015-01-15 18:28:10 +00:00
|
|
|
FaultInjectionTestEnv* env_;
|
|
|
|
std::string dbname_;
|
2018-11-09 19:17:34 +00:00
|
|
|
std::shared_ptr<Cache> tiny_cache_;
|
2015-01-15 18:28:10 +00:00
|
|
|
Options options_;
|
|
|
|
DB* db_;
|
|
|
|
|
2015-01-23 02:34:23 +00:00
|
|
|
FaultInjectionTest()
|
2018-05-07 19:15:54 +00:00
|
|
|
: option_config_(std::get<1>(GetParam())),
|
|
|
|
non_inclusive_end_range_(std::get<2>(GetParam())),
|
2015-01-24 00:03:24 +00:00
|
|
|
sync_use_wal_(false),
|
|
|
|
sync_use_compact_(true),
|
2015-01-27 22:44:19 +00:00
|
|
|
base_env_(nullptr),
|
2018-03-07 20:39:19 +00:00
|
|
|
env_(nullptr),
|
2021-11-08 19:04:01 +00:00
|
|
|
db_(nullptr) {
|
|
|
|
EXPECT_OK(
|
|
|
|
test::CreateEnvFromSystem(ConfigOptions(), &system_env_, &env_guard_));
|
|
|
|
EXPECT_NE(system_env_, nullptr);
|
|
|
|
}
|
2015-01-15 18:28:10 +00:00
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
~FaultInjectionTest() override {
|
2020-02-20 20:07:53 +00:00
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
|
2015-07-16 19:18:32 +00:00
|
|
|
}
|
|
|
|
|
2015-01-23 02:34:23 +00:00
|
|
|
bool ChangeOptions() {
|
|
|
|
option_config_++;
|
2018-05-07 19:15:54 +00:00
|
|
|
if (option_config_ >= non_inclusive_end_range_) {
|
2015-01-23 02:34:23 +00:00
|
|
|
return false;
|
|
|
|
} else {
|
2015-01-27 22:44:19 +00:00
|
|
|
if (option_config_ == kMultiLevels) {
|
2021-11-08 19:04:01 +00:00
|
|
|
base_env_.reset(MockEnv::Create(system_env_));
|
2015-01-27 22:44:19 +00:00
|
|
|
}
|
2015-01-23 02:34:23 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return the current option configuration.
|
|
|
|
Options CurrentOptions() {
|
2015-01-24 00:03:24 +00:00
|
|
|
sync_use_wal_ = false;
|
|
|
|
sync_use_compact_ = true;
|
2015-01-23 02:34:23 +00:00
|
|
|
Options options;
|
|
|
|
switch (option_config_) {
|
|
|
|
case kWalDir:
|
2018-07-14 00:18:39 +00:00
|
|
|
options.wal_dir = test::PerThreadDBPath(env_, "fault_test_wal");
|
2015-01-23 02:34:23 +00:00
|
|
|
break;
|
|
|
|
case kDifferentDataDir:
|
2018-07-14 00:18:39 +00:00
|
|
|
options.db_paths.emplace_back(
|
|
|
|
test::PerThreadDBPath(env_, "fault_test_data"), 1000000U);
|
2015-01-23 02:34:23 +00:00
|
|
|
break;
|
|
|
|
case kSyncWal:
|
2015-01-24 00:03:24 +00:00
|
|
|
sync_use_wal_ = true;
|
|
|
|
sync_use_compact_ = false;
|
2015-01-23 02:34:23 +00:00
|
|
|
break;
|
|
|
|
case kWalDirSyncWal:
|
2018-07-14 00:18:39 +00:00
|
|
|
options.wal_dir = test::PerThreadDBPath(env_, "/fault_test_wal");
|
2015-01-24 00:03:24 +00:00
|
|
|
sync_use_wal_ = true;
|
|
|
|
sync_use_compact_ = false;
|
2015-01-23 02:34:23 +00:00
|
|
|
break;
|
2015-01-27 22:44:19 +00:00
|
|
|
case kMultiLevels:
|
|
|
|
options.write_buffer_size = 64 * 1024;
|
|
|
|
options.target_file_size_base = 64 * 1024;
|
|
|
|
options.level0_file_num_compaction_trigger = 2;
|
|
|
|
options.level0_slowdown_writes_trigger = 2;
|
|
|
|
options.level0_stop_writes_trigger = 4;
|
|
|
|
options.max_bytes_for_level_base = 128 * 1024;
|
|
|
|
options.max_write_buffer_number = 2;
|
|
|
|
options.max_background_compactions = 8;
|
|
|
|
options.max_background_flushes = 8;
|
|
|
|
sync_use_wal_ = true;
|
|
|
|
sync_use_compact_ = false;
|
|
|
|
break;
|
2015-01-23 02:34:23 +00:00
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return options;
|
|
|
|
}
|
|
|
|
|
2015-01-15 18:28:10 +00:00
|
|
|
Status NewDB() {
|
2018-03-07 20:39:19 +00:00
|
|
|
assert(db_ == nullptr);
|
2015-01-15 18:28:10 +00:00
|
|
|
assert(tiny_cache_ == nullptr);
|
2018-03-07 20:39:19 +00:00
|
|
|
assert(env_ == nullptr);
|
2015-01-15 18:28:10 +00:00
|
|
|
|
2021-11-08 19:04:01 +00:00
|
|
|
env_ = new FaultInjectionTestEnv(base_env_ ? base_env_.get() : system_env_);
|
2015-01-15 18:28:10 +00:00
|
|
|
|
2015-01-23 02:34:23 +00:00
|
|
|
options_ = CurrentOptions();
|
2015-01-15 18:28:10 +00:00
|
|
|
options_.env = env_;
|
|
|
|
options_.paranoid_checks = true;
|
|
|
|
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
tiny_cache_ = NewLRUCache(100);
|
|
|
|
table_options.block_cache = tiny_cache_;
|
|
|
|
options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
2018-07-14 00:18:39 +00:00
|
|
|
dbname_ = test::PerThreadDBPath("fault_test");
|
2015-01-15 18:28:10 +00:00
|
|
|
|
rocksdb: Replace ASSERT* with EXPECT* in functions that does not return void value
Summary:
gtest does not use exceptions to fail a unit test by design, and `ASSERT*`s are implemented using `return`. As a consequence we cannot use `ASSERT*` in a function that does not return `void` value ([[ https://code.google.com/p/googletest/wiki/AdvancedGuide#Assertion_Placement | 1]]), and have to fix our existing code. This diff does this in a generic way, with no manual changes.
In order to detect all existing `ASSERT*` that are used in functions that doesn't return void value, I change the code to generate compile errors for such cases.
In `util/testharness.h` I defined `EXPECT*` assertions, the same way as `ASSERT*`, and redefined `ASSERT*` to return `void`. Then executed:
```lang=bash
% USE_CLANG=1 make all -j55 -k 2> build.log
% perl -naF: -e 'print "-- -number=".$F[1]." ".$F[0]."\n" if /: error:/' \
build.log | xargs -L 1 perl -spi -e 's/ASSERT/EXPECT/g if $. == $number'
% make format
```
After that I reverted back change to `ASSERT*` in `util/testharness.h`. But preserved introduced `EXPECT*`, which is the same as `ASSERT*`. This will be deleted once switched to gtest.
This diff is independent and contains manual changes only in `util/testharness.h`.
Test Plan:
Make sure all tests are passing.
```lang=bash
% USE_CLANG=1 make check
```
Reviewers: igor, lgalanis, sdong, yufei.zhu, rven, meyering
Reviewed By: meyering
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D33333
2015-03-17 03:52:32 +00:00
|
|
|
EXPECT_OK(DestroyDB(dbname_, options_));
|
2015-01-23 02:34:23 +00:00
|
|
|
|
2015-01-15 18:28:10 +00:00
|
|
|
options_.create_if_missing = true;
|
|
|
|
Status s = OpenDB();
|
|
|
|
options_.create_if_missing = false;
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2015-07-16 19:18:32 +00:00
|
|
|
void SetUp() override {
|
2018-05-07 19:15:54 +00:00
|
|
|
sequential_order_ = std::get<0>(GetParam());
|
2015-07-16 19:18:32 +00:00
|
|
|
ASSERT_OK(NewDB());
|
|
|
|
}
|
2015-01-15 18:28:10 +00:00
|
|
|
|
2015-03-17 21:08:00 +00:00
|
|
|
void TearDown() override {
|
2015-01-15 18:28:10 +00:00
|
|
|
CloseDB();
|
|
|
|
|
2015-01-23 02:34:23 +00:00
|
|
|
Status s = DestroyDB(dbname_, options_);
|
2015-01-15 18:28:10 +00:00
|
|
|
|
|
|
|
delete env_;
|
2018-03-07 20:39:19 +00:00
|
|
|
env_ = nullptr;
|
2015-01-15 18:28:10 +00:00
|
|
|
|
|
|
|
tiny_cache_.reset();
|
|
|
|
|
2015-03-17 21:08:00 +00:00
|
|
|
ASSERT_OK(s);
|
2015-01-15 18:28:10 +00:00
|
|
|
}
|
|
|
|
|
2015-07-16 19:18:32 +00:00
|
|
|
void Build(const WriteOptions& write_options, int start_idx, int num_vals) {
|
2015-01-15 18:28:10 +00:00
|
|
|
std::string key_space, value_space;
|
|
|
|
WriteBatch batch;
|
|
|
|
for (int i = start_idx; i < start_idx + num_vals; i++) {
|
2015-07-16 19:18:32 +00:00
|
|
|
Slice key = Key(i, &key_space);
|
2015-01-15 18:28:10 +00:00
|
|
|
batch.Clear();
|
2020-12-23 07:44:44 +00:00
|
|
|
ASSERT_OK(batch.Put(key, Value(i, &value_space)));
|
2015-01-23 02:34:23 +00:00
|
|
|
ASSERT_OK(db_->Write(write_options, &batch));
|
2015-01-15 18:28:10 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-07-16 19:18:32 +00:00
|
|
|
Status ReadValue(int i, std::string* val) const {
|
2015-01-15 18:28:10 +00:00
|
|
|
std::string key_space, value_space;
|
2015-07-16 19:18:32 +00:00
|
|
|
Slice key = Key(i, &key_space);
|
2015-01-15 18:28:10 +00:00
|
|
|
Value(i, &value_space);
|
|
|
|
ReadOptions options;
|
|
|
|
return db_->Get(options, key, val);
|
|
|
|
}
|
|
|
|
|
2015-07-16 19:18:32 +00:00
|
|
|
Status Verify(int start_idx, int num_vals,
|
|
|
|
ExpectedVerifResult expected) const {
|
2015-01-15 18:28:10 +00:00
|
|
|
std::string val;
|
|
|
|
std::string value_space;
|
|
|
|
Status s;
|
|
|
|
for (int i = start_idx; i < start_idx + num_vals && s.ok(); i++) {
|
|
|
|
Value(i, &value_space);
|
2015-07-16 19:18:32 +00:00
|
|
|
s = ReadValue(i, &val);
|
2015-01-23 02:34:23 +00:00
|
|
|
if (s.ok()) {
|
rocksdb: Replace ASSERT* with EXPECT* in functions that does not return void value
Summary:
gtest does not use exceptions to fail a unit test by design, and `ASSERT*`s are implemented using `return`. As a consequence we cannot use `ASSERT*` in a function that does not return `void` value ([[ https://code.google.com/p/googletest/wiki/AdvancedGuide#Assertion_Placement | 1]]), and have to fix our existing code. This diff does this in a generic way, with no manual changes.
In order to detect all existing `ASSERT*` that are used in functions that doesn't return void value, I change the code to generate compile errors for such cases.
In `util/testharness.h` I defined `EXPECT*` assertions, the same way as `ASSERT*`, and redefined `ASSERT*` to return `void`. Then executed:
```lang=bash
% USE_CLANG=1 make all -j55 -k 2> build.log
% perl -naF: -e 'print "-- -number=".$F[1]." ".$F[0]."\n" if /: error:/' \
build.log | xargs -L 1 perl -spi -e 's/ASSERT/EXPECT/g if $. == $number'
% make format
```
After that I reverted back change to `ASSERT*` in `util/testharness.h`. But preserved introduced `EXPECT*`, which is the same as `ASSERT*`. This will be deleted once switched to gtest.
This diff is independent and contains manual changes only in `util/testharness.h`.
Test Plan:
Make sure all tests are passing.
```lang=bash
% USE_CLANG=1 make check
```
Reviewers: igor, lgalanis, sdong, yufei.zhu, rven, meyering
Reviewed By: meyering
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D33333
2015-03-17 03:52:32 +00:00
|
|
|
EXPECT_EQ(value_space, val);
|
2015-01-23 02:34:23 +00:00
|
|
|
}
|
|
|
|
if (expected == kValExpectFound) {
|
|
|
|
if (!s.ok()) {
|
|
|
|
fprintf(stderr, "Error when read %dth record (expect found): %s\n", i,
|
|
|
|
s.ToString().c_str());
|
|
|
|
return s;
|
2015-01-15 18:28:10 +00:00
|
|
|
}
|
2015-01-23 02:34:23 +00:00
|
|
|
} else if (!s.ok() && !s.IsNotFound()) {
|
|
|
|
fprintf(stderr, "Error when read %dth record: %s\n", i,
|
|
|
|
s.ToString().c_str());
|
|
|
|
return s;
|
2015-01-15 18:28:10 +00:00
|
|
|
}
|
|
|
|
}
|
2015-01-23 02:34:23 +00:00
|
|
|
return Status::OK();
|
2015-01-15 18:28:10 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Return the ith key
|
2015-07-16 19:18:32 +00:00
|
|
|
Slice Key(int i, std::string* storage) const {
|
2018-02-13 22:07:48 +00:00
|
|
|
unsigned long long num = i;
|
2015-07-16 19:18:32 +00:00
|
|
|
if (!sequential_order_) {
|
2015-07-16 02:58:28 +00:00
|
|
|
// random transfer
|
|
|
|
const int m = 0x5bd1e995;
|
|
|
|
num *= m;
|
|
|
|
num ^= num << 24;
|
|
|
|
}
|
2015-01-15 18:28:10 +00:00
|
|
|
char buf[100];
|
2018-02-13 22:07:48 +00:00
|
|
|
snprintf(buf, sizeof(buf), "%016d", static_cast<int>(num));
|
2015-01-15 18:28:10 +00:00
|
|
|
storage->assign(buf, strlen(buf));
|
|
|
|
return Slice(*storage);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return the value to associate with the specified key
|
|
|
|
Slice Value(int k, std::string* storage) const {
|
|
|
|
Random r(k);
|
2020-07-09 21:33:42 +00:00
|
|
|
*storage = r.RandomString(kValueSize);
|
|
|
|
return Slice(*storage);
|
2015-01-15 18:28:10 +00:00
|
|
|
}
|
|
|
|
|
2015-12-15 23:26:20 +00:00
|
|
|
void CloseDB() {
|
2015-01-15 18:28:10 +00:00
|
|
|
delete db_;
|
2017-05-19 17:43:11 +00:00
|
|
|
db_ = nullptr;
|
2015-01-15 18:28:10 +00:00
|
|
|
}
|
|
|
|
|
2015-12-15 23:26:20 +00:00
|
|
|
Status OpenDB() {
|
|
|
|
CloseDB();
|
|
|
|
env_->ResetState();
|
2017-05-19 17:43:11 +00:00
|
|
|
Status s = DB::Open(options_, dbname_, &db_);
|
|
|
|
assert(db_ != nullptr);
|
|
|
|
return s;
|
2015-01-15 18:28:10 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void DeleteAllData() {
|
|
|
|
Iterator* iter = db_->NewIterator(ReadOptions());
|
|
|
|
WriteOptions options;
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
|
|
ASSERT_OK(db_->Delete(WriteOptions(), iter->key()));
|
|
|
|
}
|
2020-12-23 07:44:44 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2015-01-15 18:28:10 +00:00
|
|
|
delete iter;
|
2015-01-23 02:34:23 +00:00
|
|
|
|
|
|
|
FlushOptions flush_options;
|
|
|
|
flush_options.wait = true;
|
2020-12-23 07:44:44 +00:00
|
|
|
ASSERT_OK(db_->Flush(flush_options));
|
2015-01-15 18:28:10 +00:00
|
|
|
}
|
|
|
|
|
2015-01-26 23:22:18 +00:00
|
|
|
// rnd cannot be null for kResetDropRandomUnsyncedData
|
|
|
|
void ResetDBState(ResetMethod reset_method, Random* rnd = nullptr) {
|
2015-01-28 00:34:16 +00:00
|
|
|
env_->AssertNoOpenFile();
|
2015-01-15 18:28:10 +00:00
|
|
|
switch (reset_method) {
|
2015-01-23 02:34:23 +00:00
|
|
|
case kResetDropUnsyncedData:
|
2015-01-15 18:28:10 +00:00
|
|
|
ASSERT_OK(env_->DropUnsyncedFileData());
|
|
|
|
break;
|
2015-01-26 23:22:18 +00:00
|
|
|
case kResetDropRandomUnsyncedData:
|
|
|
|
ASSERT_OK(env_->DropRandomUnsyncedFileData(rnd));
|
|
|
|
break;
|
2015-01-23 02:34:23 +00:00
|
|
|
case kResetDeleteUnsyncedFiles:
|
|
|
|
ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync());
|
|
|
|
break;
|
|
|
|
case kResetDropAndDeleteUnsynced:
|
|
|
|
ASSERT_OK(env_->DropUnsyncedFileData());
|
2015-01-15 18:28:10 +00:00
|
|
|
ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync());
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
assert(false);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void PartialCompactTestPreFault(int num_pre_sync, int num_post_sync) {
|
|
|
|
DeleteAllData();
|
2015-01-23 02:34:23 +00:00
|
|
|
|
|
|
|
WriteOptions write_options;
|
2015-01-24 00:03:24 +00:00
|
|
|
write_options.sync = sync_use_wal_;
|
2015-01-23 02:34:23 +00:00
|
|
|
|
|
|
|
Build(write_options, 0, num_pre_sync);
|
2015-01-24 00:03:24 +00:00
|
|
|
if (sync_use_compact_) {
|
2020-12-23 07:44:44 +00:00
|
|
|
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
|
2015-01-23 02:34:23 +00:00
|
|
|
}
|
|
|
|
write_options.sync = false;
|
|
|
|
Build(write_options, num_pre_sync, num_post_sync);
|
2015-01-15 18:28:10 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void PartialCompactTestReopenWithFault(ResetMethod reset_method,
|
2015-01-26 23:22:18 +00:00
|
|
|
int num_pre_sync, int num_post_sync,
|
|
|
|
Random* rnd = nullptr) {
|
2015-01-15 18:28:10 +00:00
|
|
|
env_->SetFilesystemActive(false);
|
|
|
|
CloseDB();
|
2015-01-26 23:22:18 +00:00
|
|
|
ResetDBState(reset_method, rnd);
|
2015-01-15 18:28:10 +00:00
|
|
|
ASSERT_OK(OpenDB());
|
2015-01-23 02:34:23 +00:00
|
|
|
ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::kValExpectFound));
|
2015-01-15 18:28:10 +00:00
|
|
|
ASSERT_OK(Verify(num_pre_sync, num_post_sync,
|
2015-01-23 02:34:23 +00:00
|
|
|
FaultInjectionTest::kValExpectNoError));
|
2015-07-16 19:18:32 +00:00
|
|
|
WaitCompactionFinish();
|
|
|
|
ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::kValExpectFound));
|
|
|
|
ASSERT_OK(Verify(num_pre_sync, num_post_sync,
|
|
|
|
FaultInjectionTest::kValExpectNoError));
|
2015-01-15 18:28:10 +00:00
|
|
|
}
|
|
|
|
|
2022-11-02 21:34:24 +00:00
|
|
|
void NoWriteTestPreFault() {}
|
2015-01-15 18:28:10 +00:00
|
|
|
|
|
|
|
void NoWriteTestReopenWithFault(ResetMethod reset_method) {
|
|
|
|
CloseDB();
|
|
|
|
ResetDBState(reset_method);
|
|
|
|
ASSERT_OK(OpenDB());
|
|
|
|
}
|
2015-07-16 19:18:32 +00:00
|
|
|
|
|
|
|
void WaitCompactionFinish() {
|
2020-12-23 07:44:44 +00:00
|
|
|
ASSERT_OK(static_cast<DBImpl*>(db_->GetRootDB())->TEST_WaitForCompact());
|
2015-07-16 19:18:32 +00:00
|
|
|
ASSERT_OK(db_->Put(WriteOptions(), "", ""));
|
|
|
|
}
|
2021-11-08 19:04:01 +00:00
|
|
|
|
|
|
|
private:
|
|
|
|
Env* system_env_;
|
|
|
|
std::shared_ptr<Env> env_guard_;
|
2015-01-15 18:28:10 +00:00
|
|
|
};
|
|
|
|
|
2018-05-07 19:15:54 +00:00
|
|
|
// Empty subclass of FaultInjectionTest that serves as the fixture for
// FaultTest. NOTE(review): presumably exists so FaultTest can be instantiated
// with its own parameter ranges -- confirm at the instantiation site.
class FaultInjectionTestSplitted : public FaultInjectionTest {};
|
|
|
|
|
|
|
|
// For every option configuration in the parameterized range, runs
// kNumIterations rounds. Each round exercises four fault scenarios with the
// same randomly chosen pre-sync / post-sync key counts.
TEST_P(FaultInjectionTestSplitted, FaultTest) {
  do {
    Random rnd(301);

    for (size_t round = 0; round < kNumIterations; ++round) {
      const int num_pre_sync = rnd.Uniform(kMaxNumValues);
      const int num_post_sync = rnd.Uniform(kMaxNumValues);

      // One scenario: write data, reopen after `write_fault`, then reopen
      // once more after `no_write_fault` without writing anything in between.
      auto run_scenario = [&](ResetMethod write_fault, Random* fault_rnd,
                              ResetMethod no_write_fault) {
        PartialCompactTestPreFault(num_pre_sync, num_post_sync);
        PartialCompactTestReopenWithFault(write_fault, num_pre_sync,
                                          num_post_sync, fault_rnd);
        NoWriteTestPreFault();
        NoWriteTestReopenWithFault(no_write_fault);
      };

      run_scenario(kResetDropUnsyncedData, nullptr, kResetDropUnsyncedData);

      run_scenario(kResetDropRandomUnsyncedData, &rnd, kResetDropUnsyncedData);

      // Setting a separate data path won't pass the test as we don't sync
      // it after creating new files,
      run_scenario(kResetDropAndDeleteUnsynced, nullptr,
                   kResetDropAndDeleteUnsynced);

      // No new files created so we expect all values since no files will be
      // dropped.
      run_scenario(kResetDeleteUnsyncedFiles, nullptr,
                   kResetDeleteUnsyncedFiles);
    }
  } while (ChangeOptions());
}
|
|
|
|
|
2015-04-09 04:18:05 +00:00
|
|
|
// Previous log file is not fsynced if sync is forced after log rolling.
|
[wal changes 2/3] write with sync=true syncs previous unsynced wals to prevent illegal data loss
Summary:
I'll just copy internal task summary here:
"
This sequence will cause data loss in the middle after a sync write:
non-sync write key 1
flush triggered, not yet scheduled
sync write key 2
system crash
After rebooting, users might see key 2 but not key 1, which violates the API of sync write.
This can be reproduced using unit test FaultInjectionTest::DISABLED_WriteOptionSyncTest.
One way to fix it: for a sync write, if there are outstanding unsynced log files, we need to sync them too.
"
This diff should be considered together with the next diff D40905; in isolation this fix probably could be a little simpler.
Test Plan: `make check`; added a test for that (DBTest.SyncingPreviousLogs) before noticing FaultInjectionTest.WriteOptionSyncTest (keeping both since mine asserts a bit more); both tests fail without this diff; for D40905 stacked on top of this diff, ran tests with ASAN, TSAN and valgrind
Reviewers: rven, yhchiang, IslamAbdelRahman, anthony, kradhakrishnan, igor, sdong
Reviewed By: sdong
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D40899
2015-07-22 10:28:08 +00:00
|
|
|
// Writes key 1 unsynced, requests a flush that cannot run, writes key 2 with
// sync=true, then drops unsynced data and deletes unsynced files. After
// reopening, both keys must be readable with their original values.
TEST_P(FaultInjectionTest, WriteOptionSyncTest) {
  test::SleepingBackgroundTask flush_blocker;
  env_->SetBackgroundThreads(1, Env::HIGH);
  // Block the job queue to prevent flush job from running.
  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &flush_blocker,
                 Env::Priority::HIGH);
  flush_blocker.WaitUntilSleeping();

  WriteOptions unsynced_write;
  unsynced_write.sync = false;

  std::string key_space, value_space;
  ASSERT_OK(
      db_->Put(unsynced_write, Key(1, &key_space), Value(1, &value_space)));

  FlushOptions no_wait_flush;
  no_wait_flush.wait = false;
  ASSERT_OK(db_->Flush(no_wait_flush));

  // Same options as the first write, but with sync turned on.
  WriteOptions synced_write = unsynced_write;
  synced_write.sync = true;
  ASSERT_OK(
      db_->Put(synced_write, Key(2, &key_space), Value(2, &value_space)));
  ASSERT_OK(db_->FlushWAL(false));

  env_->SetFilesystemActive(false);
  NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
  flush_blocker.WakeUp();
  flush_blocker.WaitUntilDone();

  ASSERT_OK(OpenDB());

  // Key 2 is checked first, then key 1.
  std::string val;
  for (int k : {2, 1}) {
    Value(k, &value_space);
    ASSERT_OK(ReadValue(k, &val));
    ASSERT_EQ(value_space, val);
  }
}
|
|
|
|
|
2015-07-16 19:18:32 +00:00
|
|
|
// Deactivates the filesystem around a compaction, drops unsynced data, and
// checks that every key is still readable after reopening -- both right away
// and after compactions have finished. Also checks that no background
// compaction starts before DBImpl::Open has reported "Opened".
TEST_P(FaultInjectionTest, UninstalledCompaction) {
  auto* sync_point = ROCKSDB_NAMESPACE::SyncPoint::GetInstance();

  options_.target_file_size_base = 32 * 1024;
  options_.write_buffer_size = 100 << 10;  // 100KB
  options_.level0_file_num_compaction_trigger = 6;
  options_.level0_stop_writes_trigger = 1 << 10;
  options_.level0_slowdown_writes_trigger = 1 << 10;
  options_.max_background_compactions = 1;
  ASSERT_OK(OpenDB());

  // The dependency chain is only installed for non-sequential key order.
  if (!sequential_order_) {
    sync_point->LoadDependency({
        {"FaultInjectionTest::FaultTest:0", "DBImpl::BGWorkCompaction"},
        {"CompactionJob::Run():End", "FaultInjectionTest::FaultTest:1"},
        {"FaultInjectionTest::FaultTest:2",
         "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"},
    });
  }
  sync_point->EnableProcessing();

  const int kNumKeys = 1000;
  Build(WriteOptions(), 0, kNumKeys);
  FlushOptions wait_flush;
  wait_flush.wait = true;
  ASSERT_OK(db_->Flush(wait_flush));
  ASSERT_OK(db_->Put(WriteOptions(), "", ""));
  TEST_SYNC_POINT("FaultInjectionTest::FaultTest:0");
  TEST_SYNC_POINT("FaultInjectionTest::FaultTest:1");
  env_->SetFilesystemActive(false);
  TEST_SYNC_POINT("FaultInjectionTest::FaultTest:2");
  CloseDB();
  sync_point->DisableProcessing();
  ResetDBState(kResetDropUnsyncedData);

  // Set once the "DBImpl::Open:Opened" sync point has fired.
  std::atomic<bool> opened(false);
  sync_point->SetCallBack("DBImpl::Open:Opened",
                          [&](void* /*arg*/) { opened.store(true); });
  sync_point->SetCallBack("DBImpl::BGWorkCompaction",
                          [&](void* /*arg*/) { ASSERT_TRUE(opened.load()); });
  sync_point->EnableProcessing();
  ASSERT_OK(OpenDB());
  ASSERT_OK(Verify(0, kNumKeys, FaultInjectionTest::kValExpectFound));
  WaitCompactionFinish();
  ASSERT_OK(Verify(0, kNumKeys, FaultInjectionTest::kValExpectFound));
  sync_point->DisableProcessing();
  sync_point->ClearAllCallBacks();
}
|
|
|
|
|
[wal changes 3/3] method in DB to sync WAL without blocking writers
Summary:
Subj. We really need this feature.
Previous diff D40899 has most of the changes to make this possible, this diff just adds the method.
Test Plan: `make check`, the new test fails without this diff; ran with ASAN, TSAN and valgrind.
Reviewers: igor, rven, IslamAbdelRahman, anthony, kradhakrishnan, tnovak, yhchiang, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, maykov, hermanlee4, yoshinorim, tnovak, dhruba
Differential Revision: https://reviews.facebook.net/D40905
2015-08-05 13:06:39 +00:00
|
|
|
// Writes two keys without per-write sync, calls FlushWAL(true), then
// deactivates the filesystem and reopens with kResetDropAndDeleteUnsynced.
// Both keys are expected to be readable with their original values afterwards.
TEST_P(FaultInjectionTest, ManualLogSyncTest) {
  env_->SetBackgroundThreads(1, Env::HIGH);
  // Block the job queue to prevent flush job from running.
  test::SleepingBackgroundTask flush_blocker;
  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &flush_blocker,
                 Env::Priority::HIGH);
  flush_blocker.WaitUntilSleeping();

  std::string key_buf, value_buf;

  WriteOptions unsynced_write;
  unsynced_write.sync = false;

  FlushOptions nonblocking_flush;
  nonblocking_flush.wait = false;

  // First key, followed by a flush request that is not waited on.
  ASSERT_OK(
      db_->Put(unsynced_write, Key(1, &key_buf), Value(1, &value_buf)));
  ASSERT_OK(db_->Flush(nonblocking_flush));

  // Second key, followed by the explicit WAL flush under test.
  ASSERT_OK(
      db_->Put(unsynced_write, Key(2, &key_buf), Value(2, &value_buf)));
  ASSERT_OK(db_->FlushWAL(true));

  // Stop the filesystem and reopen through the fault-injection path before
  // releasing the blocked background thread.
  env_->SetFilesystemActive(false);
  NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
  flush_blocker.WakeUp();
  flush_blocker.WaitUntilDone();

  ASSERT_OK(OpenDB());

  // Regenerate each expected value into value_buf and compare it with what
  // the reopened DB returns.
  std::string stored;
  Value(2, &value_buf);
  ASSERT_OK(ReadValue(2, &stored));
  ASSERT_EQ(value_buf, stored);

  Value(1, &value_buf);
  ASSERT_OK(ReadValue(1, &stored));
  ASSERT_EQ(value_buf, stored);
}
|
|
|
|
|
Add facility to write only a portion of WriteBatch to WAL
Summary:
When constructing a write batch a client may now call MarkWalTerminationPoint() on that batch. No batch operations after this call will be written to the WAL, but they will still be inserted into the Memtable. This facility is used to remove one of the three WriteImpl calls in 2PC transactions. This produces a ~1% perf improvement.
```
RocksDB - unoptimized 2pc, sync_binlog=1, disable_2pc=off
INFO 2016-08-31 14:30:38,814 [main]: REQUEST PHASE COMPLETED. 75000000 requests done in 2619 seconds. Requests/second = 28628
RocksDB - optimized 2pc , sync_binlog=1, disable_2pc=off
INFO 2016-08-31 16:26:59,442 [main]: REQUEST PHASE COMPLETED. 75000000 requests done in 2581 seconds. Requests/second = 29054
```
Test Plan: Two unit tests added.
Reviewers: sdong, yiwu, IslamAbdelRahman
Reviewed By: yiwu
Subscribers: hermanlee4, dhruba, andrewkr
Differential Revision: https://reviews.facebook.net/D64599
2016-10-07 18:31:26 +00:00
|
|
|
// Writes a batch in which "cats" is added before MarkWalTerminationPoint()
// and "boys" after it, then deactivates the filesystem and reopens with
// kResetDropAndDeleteUnsynced. After the reopen only "cats" is expected to be
// found; "boys" must report NotFound.
TEST_P(FaultInjectionTest, WriteBatchWalTerminationTest) {
  // NOTE(review): `options` is not consumed later in this test; it is kept
  // because CurrentOptions() is defined elsewhere and may have side effects.
  Options options = CurrentOptions();
  options.env = env_;
  ReadOptions read_opts;

  WriteOptions synced_write;
  synced_write.disableWAL = false;
  synced_write.sync = true;

  WriteBatch split_batch;
  ASSERT_OK(split_batch.Put("cats", "dogs"));
  split_batch.MarkWalTerminationPoint();
  ASSERT_OK(split_batch.Put("boys", "girls"));
  ASSERT_OK(db_->Write(synced_write, &split_batch));

  env_->SetFilesystemActive(false);
  NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
  ASSERT_OK(OpenDB());

  std::string recovered;
  ASSERT_OK(db_->Get(read_opts, "cats", &recovered));
  ASSERT_EQ("dogs", recovered);

  const Status boys_lookup = db_->Get(read_opts, "boys", &recovered);
  ASSERT_EQ(boys_lookup, Status::NotFound());
}
|
|
|
|
|
Fix a bug causing duplicate trailing entries in WritableFile (buffered IO) (#9236)
Summary:
`db_stress` is a user of `FaultInjectionTestFS`. After injecting a write error, `db_stress` probabilistically determines
data drop (https://github.com/facebook/rocksdb/blob/6.27.fb/db_stress_tool/db_stress_test_base.cc#L2615:L2619).
In some of our recent runs of `db_stress`, we found duplicate trailing entries corresponding to file trivial move in
the MANIFEST, causing the recovery to fail, because the file move operation is not idempotent: you cannot delete a
file from a given level twice.
Investigation suggests that data buffering in both `WritableFileWriter` and `FaultInjectionTestFS` may be the root cause.
WritableFileWriter buffers data to write in a memory buffer, `WritableFileWriter::buf_`. After each
`WriteBuffered()`/`WriteBufferedWithChecksum()` succeeds, the `buf_` is cleared.
If the underlying file `WritableFileWriter::writable_file_` is opened in buffered IO mode, then `FaultInjectionTestFS`
buffers data written for each file until next file sync. After an injected error, user of `FaultInjectionFS` can
choose to drop some or none of previously buffered data. If `db_stress` does not drop any unsynced data, then
such data will still exist in the `FaultInjectionTestFS`'s buffer.
Existing implementation of `WritableFileWriter::WriteBuffered()` does not clear `buf_` if there is an error. This may lead
to the data being buffered in two copies: one in `WritableFileWriter`, and another in `FaultInjectionTestFS`.
We also know that the `WritableFileWriter` of MANIFEST file will close upon an error. During `Close()`, it will flush the
content in `buf_`. If no write error is injected to `FaultInjectionTestFS` this time, then we end up with two copies of the
data appended to the file.
To fix, we clear the `WritableFileWriter::buf_` upon failure as well. We focus this PR on files opened in non-direct mode.
This PR includes a unit test to reproduce a case when write error injection
to `WritableFile` can cause duplicate trailing entries.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9236
Test Plan: make check
Reviewed By: zhichao-cao
Differential Revision: D33033984
Pulled By: riversand963
fbshipit-source-id: ebfa5a0db8cbf1ed73100528b34fcba543c5db31
2021-12-13 16:59:20 +00:00
|
|
|
TEST_P(FaultInjectionTest, NoDuplicateTrailingEntries) {
|
|
|
|
auto fault_fs = std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
|
|
|
|
fault_fs->EnableWriteErrorInjection();
|
|
|
|
fault_fs->SetFilesystemDirectWritable(false);
|
|
|
|
const std::string file_name = NormalizePath(dbname_ + "/test_file");
|
|
|
|
std::unique_ptr<log::Writer> log_writer = nullptr;
|
|
|
|
constexpr uint64_t log_number = 0;
|
|
|
|
{
|
|
|
|
std::unique_ptr<FSWritableFile> file;
|
|
|
|
const Status s =
|
|
|
|
fault_fs->NewWritableFile(file_name, FileOptions(), &file, nullptr);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
std::unique_ptr<WritableFileWriter> fwriter(
|
|
|
|
new WritableFileWriter(std::move(file), file_name, FileOptions()));
|
|
|
|
log_writer.reset(new log::Writer(std::move(fwriter), log_number,
|
|
|
|
/*recycle_log_files=*/false));
|
|
|
|
}
|
|
|
|
|
|
|
|
fault_fs->SetRandomWriteError(
|
|
|
|
0xdeadbeef, /*one_in=*/1, IOStatus::IOError("Injected IOError"),
|
|
|
|
/*inject_for_all_file_types=*/true, /*types=*/{});
|
|
|
|
|
|
|
|
{
|
|
|
|
VersionEdit edit;
|
|
|
|
edit.SetColumnFamily(0);
|
|
|
|
std::string buf;
|
|
|
|
assert(edit.EncodeTo(&buf));
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
const Status s = log_writer->AddRecord(WriteOptions(), buf);
|
Fix a bug causing duplicate trailing entries in WritableFile (buffered IO) (#9236)
Summary:
`db_stress` is a user of `FaultInjectionTestFS`. After injecting a write error, `db_stress` probabilistically determins
data drop (https://github.com/facebook/rocksdb/blob/6.27.fb/db_stress_tool/db_stress_test_base.cc#L2615:L2619).
In some of our recent runs of `db_stress`, we found duplicate trailing entries corresponding to file trivial move in
the MANIFEST, causing the recovery to fail, because the file move operation is not idempotent: you cannot delete a
file from a given level twice.
Investigation suggests that data buffering in both `WritableFileWriter` and `FaultInjectionTestFS` may be the root cause.
WritableFileWriter buffers data to write in a memory buffer, `WritableFileWriter::buf_`. After each
`WriteBuffered()`/`WriteBufferedWithChecksum()` succeeds, the `buf_` is cleared.
If the underlying file `WritableFileWriter::writable_file_` is opened in buffered IO mode, then `FaultInjectionTestFS`
buffers data written for each file until next file sync. After an injected error, user of `FaultInjectionFS` can
choose to drop some or none of previously buffered data. If `db_stress` does not drop any unsynced data, then
such data will still exist in the `FaultInjectionTestFS`'s buffer.
Existing implementation of `WritableileWriter::WriteBuffered()` does not clear `buf_` if there is an error. This may lead
to the data being buffered two copies: one in `WritableFileWriter`, and another in `FaultInjectionTestFS`.
We also know that the `WritableFileWriter` of MANIFEST file will close upon an error. During `Close()`, it will flush the
content in `buf_`. If no write error is injected to `FaultInjectionTestFS` this time, then we end up with two copies of the
data appended to the file.
To fix, we clear the `WritableFileWriter::buf_` upon failure as well. We focus this PR on files opened in non-direct mode.
This PR includes a unit test to reproduce a case when write error injection
to `WritableFile` can cause duplicate trailing entries.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9236
Test Plan: make check
Reviewed By: zhichao-cao
Differential Revision: D33033984
Pulled By: riversand963
fbshipit-source-id: ebfa5a0db8cbf1ed73100528b34fcba543c5db31
2021-12-13 16:59:20 +00:00
|
|
|
ASSERT_NOK(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
fault_fs->DisableWriteErrorInjection();
|
|
|
|
|
|
|
|
// Closing the log writer will cause WritableFileWriter::Close() and flush
|
|
|
|
// remaining data from its buffer to underlying file.
|
|
|
|
log_writer.reset();
|
|
|
|
|
|
|
|
{
|
|
|
|
std::unique_ptr<FSSequentialFile> file;
|
|
|
|
Status s =
|
|
|
|
fault_fs->NewSequentialFile(file_name, FileOptions(), &file, nullptr);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
std::unique_ptr<SequentialFileReader> freader(
|
|
|
|
new SequentialFileReader(std::move(file), file_name));
|
|
|
|
Status log_read_s;
|
|
|
|
class LogReporter : public log::Reader::Reporter {
|
|
|
|
public:
|
|
|
|
Status* status_;
|
|
|
|
explicit LogReporter(Status* _s) : status_(_s) {}
|
|
|
|
void Corruption(size_t /*bytes*/, const Status& _s) override {
|
|
|
|
if (status_->ok()) {
|
|
|
|
*status_ = _s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} reporter(&log_read_s);
|
|
|
|
std::unique_ptr<log::Reader> log_reader(new log::Reader(
|
|
|
|
nullptr, std::move(freader), &reporter, /*checksum=*/true, log_number));
|
|
|
|
Slice record;
|
|
|
|
std::string data;
|
|
|
|
size_t count = 0;
|
|
|
|
while (log_reader->ReadRecord(&record, &data) && log_read_s.ok()) {
|
|
|
|
VersionEdit edit;
|
|
|
|
ASSERT_OK(edit.DecodeFrom(data));
|
|
|
|
++count;
|
|
|
|
}
|
|
|
|
// Verify that only one version edit exists in the file.
|
|
|
|
ASSERT_EQ(1, count);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-06-03 22:53:09 +00:00
|
|
|
// Instantiates FaultInjectionTest under the prefix FaultTest with two
// parameter tuples that differ only in the leading bool; both use kDefault and
// kEnd for the second and third elements.
// NOTE(review): the meaning of the tuple elements is defined by the fixture's
// parameter type, which is outside this chunk - confirm there.
INSTANTIATE_TEST_CASE_P(
|
2018-05-07 19:15:54 +00:00
|
|
|
FaultTest, FaultInjectionTest,
|
|
|
|
::testing::Values(std::make_tuple(false, kDefault, kEnd),
|
|
|
|
std::make_tuple(true, kDefault, kEnd)));
|
|
|
|
|
2020-06-03 22:53:09 +00:00
|
|
|
// Instantiates FaultInjectionTestSplitted under the prefix FaultTest with four
// parameter tuples: both bool values combined with (kDefault, kSyncWal) and
// with (kSyncWal, kEnd).
// NOTE(review): presumably this splits the kDefault..kEnd span used by the
// FaultInjectionTest instantiation into two halves - confirm against the
// fixture definition, which is outside this chunk.
INSTANTIATE_TEST_CASE_P(
|
2018-05-07 19:15:54 +00:00
|
|
|
FaultTest, FaultInjectionTestSplitted,
|
|
|
|
::testing::Values(std::make_tuple(false, kDefault, kSyncWal),
|
|
|
|
std::make_tuple(true, kDefault, kSyncWal),
|
|
|
|
std::make_tuple(false, kSyncWal, kEnd),
|
|
|
|
std::make_tuple(true, kSyncWal, kEnd)));
|
2015-07-16 19:18:32 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
2015-01-15 18:28:10 +00:00
|
|
|
|
|
|
|
int main(int argc, char** argv) {
|
2022-10-18 07:35:35 +00:00
|
|
|
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
|
2015-03-17 21:08:00 +00:00
|
|
|
::testing::InitGoogleTest(&argc, argv);
|
2021-11-08 19:04:01 +00:00
|
|
|
RegisterCustomObjects(argc, argv);
|
2015-03-17 21:08:00 +00:00
|
|
|
return RUN_ALL_TESTS();
|
2015-01-15 18:28:10 +00:00
|
|
|
}
|