mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-27 20:43:57 +00:00
06394ff4e7
Summary: When compaction filter determines that a key should be removed, it updates the internal key's type to `Delete`. If this internal key is preserved in current compaction but seen by a later compaction together with `SingleDelete`, it will cause compaction iterator to return Corruption. To fix the issue, compaction filter should return more information in addition to the intention of removing a key. Therefore, we add a new `kRemoveWithSingleDelete` to `CompactionFilter::Decision`. Seeing `kRemoveWithSingleDelete`, compaction iterator will update the op type of the internal key to `kTypeSingleDelete`. In addition, I updated db_stress_shared_state.[cc|h] so that `no_overwrite_ids_` becomes `const`. It is easier to reason about thread-safety if accessed from multiple threads. This information is passed to `PrepareTxnDBOptions()` when calling from `Open()` so that we can set up the rollback deletion type callback for transactions. Finally, disable compaction filter for multiops_txn because the key removal logic of `DbStressCompactionFilter` does not quite work with `MultiOpsTxnsStressTest`. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9929 Test Plan: make check make crash_test make crash_test_with_txn Reviewed By: anand1976 Differential Revision: D36069678 Pulled By: riversand963 fbshipit-source-id: cedd2f1ba958af59ad3916f1ba6f424307955f92
438 lines
14 KiB
C++
438 lines
14 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
//
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#ifdef GFLAGS
|
|
#include "db_stress_tool/db_stress_common.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
// This file defines MultiOpsTxnsStressTest so that we can stress test RocksDB
|
|
// transactions on a simple, emulated relational table.
|
|
//
|
|
// The record format is similar to the example found at
|
|
// https://github.com/facebook/mysql-5.6/wiki/MyRocks-record-format.
|
|
//
|
|
// The table is created by
|
|
// ```
|
|
// create table t1 (
|
|
// a int primary key,
|
|
// b int,
|
|
// c int,
|
|
// key(c)
|
|
// )
|
|
// ```
|
|
//
|
|
// (For simplicity, we use uint32_t for int here.)
|
|
//
|
|
// For this table, there is a primary index using `a`, as well as a secondary
|
|
// index using `c` and `a`.
|
|
//
|
|
// Primary key format:
|
|
// | index id | M(a) |
|
|
// Primary index value:
|
|
// | b | c |
|
|
// M(a) represents the big-endian format of a.
|
|
//
|
|
// Secondary key format:
|
|
// | index id | M(c) | M(a) |
|
|
// Secondary index value:
|
|
// | crc32 |
|
|
// Similarly to M(a), M(c) is the big-endian format of c.
|
|
//
|
|
// The in-memory representation of a record is defined in class
|
|
// MultiOpsTxnsStressTest::Record that includes a number of helper methods to
|
|
// encode/decode primary index keys, primary index values, secondary index keys,
|
|
// secondary index values, etc.
|
|
//
|
|
// Sometimes primary index and secondary index reside on different column
|
|
// families, but sometimes they colocate in the same column family. Current
|
|
// implementation puts them in the same (default) column family, and this is
|
|
// subject to future change if we find it interesting to test the other case.
|
|
//
|
|
// Class MultiOpsTxnsStressTest has the following transactions for testing.
|
|
//
|
|
// 1. Primary key update
|
|
// UPDATE t1 SET a = 3 WHERE a = 2;
|
|
// ```
|
|
// tx->GetForUpdate(primary key a=2)
|
|
// tx->GetForUpdate(primary key a=3)
|
|
// tx->Delete(primary key a=2)
|
|
// tx->Put(primary key a=3, value)
|
|
// tx->batch->SingleDelete(secondary key a=2)
|
|
// tx->batch->Put(secondary key a=3, value)
|
|
// tx->Prepare()
|
|
// tx->Commit()
|
|
// ```
|
|
//
|
|
// 2. Secondary key update
|
|
// UPDATE t1 SET c = 3 WHERE c = 2;
|
|
// ```
|
|
// iter->Seek(secondary key)
|
|
// // Get corresponding primary key value(s) from iterator
|
|
// tx->GetForUpdate(primary key)
|
|
// tx->Put(primary key, value c=3)
|
|
// tx->batch->SingleDelete(secondary key c=2)
|
|
// tx->batch->Put(secondary key c=3)
|
|
// tx->Prepare()
|
|
// tx->Commit()
|
|
// ```
|
|
//
|
|
// 3. Primary index value update
|
|
// UPDATE t1 SET b = b + 1 WHERE a = 2;
|
|
// ```
|
|
// tx->GetForUpdate(primary key a=2)
|
|
// tx->Put(primary key a=2, value b=b+1)
|
|
// tx->Prepare()
|
|
// tx->Commit()
|
|
// ```
|
|
//
|
|
// 4. Point lookup
|
|
// SELECT * FROM t1 WHERE a = 3;
|
|
// ```
|
|
// tx->Get(primary key a=3)
|
|
// tx->Commit()
|
|
// ```
|
|
//
|
|
// 5. Range scan
|
|
// SELECT * FROM t1 WHERE c = 2;
|
|
// ```
|
|
// it = tx->GetIterator()
|
|
// it->Seek(secondary key c=2)
|
|
// tx->Commit()
|
|
// ```
|
|
|
|
class MultiOpsTxnsStressTest : public StressTest {
|
|
public:
|
|
class Record {
|
|
public:
|
|
static constexpr uint32_t kMetadataPrefix = 0;
|
|
static constexpr uint32_t kPrimaryIndexId = 1;
|
|
static constexpr uint32_t kSecondaryIndexId = 2;
|
|
|
|
static constexpr size_t kPrimaryIndexEntrySize = 8 + 8;
|
|
static constexpr size_t kSecondaryIndexEntrySize = 12 + 4;
|
|
|
|
static_assert(kPrimaryIndexId < kSecondaryIndexId,
|
|
"kPrimaryIndexId must be smaller than kSecondaryIndexId");
|
|
|
|
static_assert(sizeof(kPrimaryIndexId) == sizeof(uint32_t),
|
|
"kPrimaryIndexId must be 4 bytes");
|
|
static_assert(sizeof(kSecondaryIndexId) == sizeof(uint32_t),
|
|
"kSecondaryIndexId must be 4 bytes");
|
|
|
|
// Used for generating search key to probe primary index.
|
|
static std::string EncodePrimaryKey(uint32_t a);
|
|
// Used for generating search prefix to probe secondary index.
|
|
static std::string EncodeSecondaryKey(uint32_t c);
|
|
// Used for generating search key to probe secondary index.
|
|
static std::string EncodeSecondaryKey(uint32_t c, uint32_t a);
|
|
|
|
static std::tuple<Status, uint32_t, uint32_t> DecodePrimaryIndexValue(
|
|
Slice primary_index_value);
|
|
|
|
static std::pair<Status, uint32_t> DecodeSecondaryIndexValue(
|
|
Slice secondary_index_value);
|
|
|
|
Record() = default;
|
|
Record(uint32_t _a, uint32_t _b, uint32_t _c) : a_(_a), b_(_b), c_(_c) {}
|
|
|
|
bool operator==(const Record& other) const {
|
|
return a_ == other.a_ && b_ == other.b_ && c_ == other.c_;
|
|
}
|
|
|
|
bool operator!=(const Record& other) const { return !(*this == other); }
|
|
|
|
std::pair<std::string, std::string> EncodePrimaryIndexEntry() const;
|
|
|
|
std::string EncodePrimaryKey() const;
|
|
|
|
std::string EncodePrimaryIndexValue() const;
|
|
|
|
std::pair<std::string, std::string> EncodeSecondaryIndexEntry() const;
|
|
|
|
std::string EncodeSecondaryKey() const;
|
|
|
|
Status DecodePrimaryIndexEntry(Slice primary_index_key,
|
|
Slice primary_index_value);
|
|
|
|
Status DecodeSecondaryIndexEntry(Slice secondary_index_key,
|
|
Slice secondary_index_value);
|
|
|
|
uint32_t a_value() const { return a_; }
|
|
uint32_t b_value() const { return b_; }
|
|
uint32_t c_value() const { return c_; }
|
|
|
|
void SetA(uint32_t _a) { a_ = _a; }
|
|
void SetB(uint32_t _b) { b_ = _b; }
|
|
void SetC(uint32_t _c) { c_ = _c; }
|
|
|
|
std::string ToString() const {
|
|
std::string ret("(");
|
|
ret.append(std::to_string(a_));
|
|
ret.append(",");
|
|
ret.append(std::to_string(b_));
|
|
ret.append(",");
|
|
ret.append(std::to_string(c_));
|
|
ret.append(")");
|
|
return ret;
|
|
}
|
|
|
|
private:
|
|
friend class InvariantChecker;
|
|
|
|
uint32_t a_{0};
|
|
uint32_t b_{0};
|
|
uint32_t c_{0};
|
|
};
|
|
|
|
MultiOpsTxnsStressTest() {}
|
|
|
|
~MultiOpsTxnsStressTest() override {}
|
|
|
|
void FinishInitDb(SharedState*) override;
|
|
|
|
void ReopenAndPreloadDbIfNeeded(SharedState* shared);
|
|
|
|
bool IsStateTracked() const override { return false; }
|
|
|
|
Status TestGet(ThreadState* thread, const ReadOptions& read_opts,
|
|
const std::vector<int>& rand_column_families,
|
|
const std::vector<int64_t>& rand_keys) override;
|
|
|
|
std::vector<Status> TestMultiGet(
|
|
ThreadState* thread, const ReadOptions& read_opts,
|
|
const std::vector<int>& rand_column_families,
|
|
const std::vector<int64_t>& rand_keys) override;
|
|
|
|
Status TestPrefixScan(ThreadState* thread, const ReadOptions& read_opts,
|
|
const std::vector<int>& rand_column_families,
|
|
const std::vector<int64_t>& rand_keys) override;
|
|
|
|
// Given a key K, this creates an iterator which scans to K and then
|
|
// does a random sequence of Next/Prev operations.
|
|
Status TestIterate(ThreadState* thread, const ReadOptions& read_opts,
|
|
const std::vector<int>& rand_column_families,
|
|
const std::vector<int64_t>& rand_keys) override;
|
|
|
|
Status TestPut(ThreadState* thread, WriteOptions& write_opts,
|
|
const ReadOptions& read_opts, const std::vector<int>& cf_ids,
|
|
const std::vector<int64_t>& keys, char (&value)[100],
|
|
std::unique_ptr<MutexLock>& lock) override;
|
|
|
|
Status TestDelete(ThreadState* thread, WriteOptions& write_opts,
|
|
const std::vector<int>& rand_column_families,
|
|
const std::vector<int64_t>& rand_keys,
|
|
std::unique_ptr<MutexLock>& lock) override;
|
|
|
|
Status TestDeleteRange(ThreadState* thread, WriteOptions& write_opts,
|
|
const std::vector<int>& rand_column_families,
|
|
const std::vector<int64_t>& rand_keys,
|
|
std::unique_ptr<MutexLock>& lock) override;
|
|
|
|
void TestIngestExternalFile(ThreadState* thread,
|
|
const std::vector<int>& rand_column_families,
|
|
const std::vector<int64_t>& rand_keys,
|
|
std::unique_ptr<MutexLock>& lock) override;
|
|
|
|
void TestCompactRange(ThreadState* thread, int64_t rand_key,
|
|
const Slice& start_key,
|
|
ColumnFamilyHandle* column_family) override;
|
|
|
|
Status TestBackupRestore(ThreadState* thread,
|
|
const std::vector<int>& rand_column_families,
|
|
const std::vector<int64_t>& rand_keys) override;
|
|
|
|
Status TestCheckpoint(ThreadState* thread,
|
|
const std::vector<int>& rand_column_families,
|
|
const std::vector<int64_t>& rand_keys) override;
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
Status TestApproximateSize(ThreadState* thread, uint64_t iteration,
|
|
const std::vector<int>& rand_column_families,
|
|
const std::vector<int64_t>& rand_keys) override;
|
|
#endif // !ROCKSDB_LITE
|
|
|
|
Status TestCustomOperations(
|
|
ThreadState* thread,
|
|
const std::vector<int>& rand_column_families) override;
|
|
|
|
void RegisterAdditionalListeners() override;
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
void PrepareTxnDbOptions(SharedState* /*shared*/,
|
|
TransactionDBOptions& txn_db_opts) override;
|
|
#endif // !ROCKSDB_LITE
|
|
|
|
Status PrimaryKeyUpdateTxn(ThreadState* thread, uint32_t old_a,
|
|
uint32_t old_a_pos, uint32_t new_a);
|
|
|
|
Status SecondaryKeyUpdateTxn(ThreadState* thread, uint32_t old_c,
|
|
uint32_t old_c_pos, uint32_t new_c);
|
|
|
|
Status UpdatePrimaryIndexValueTxn(ThreadState* thread, uint32_t a,
|
|
uint32_t b_delta);
|
|
|
|
Status PointLookupTxn(ThreadState* thread, ReadOptions ropts, uint32_t a);
|
|
|
|
Status RangeScanTxn(ThreadState* thread, ReadOptions ropts, uint32_t c);
|
|
|
|
void VerifyDb(ThreadState* thread) const override;
|
|
|
|
void ContinuouslyVerifyDb(ThreadState* thread) const override {
|
|
VerifyDb(thread);
|
|
}
|
|
|
|
void VerifyPkSkFast(int job_id);
|
|
|
|
protected:
|
|
class Counter {
|
|
public:
|
|
uint64_t Next() { return value_.fetch_add(1); }
|
|
|
|
private:
|
|
std::atomic<uint64_t> value_ = Env::Default()->NowNanos();
|
|
};
|
|
|
|
using KeySet = std::set<uint32_t>;
|
|
class KeyGenerator {
|
|
public:
|
|
explicit KeyGenerator(uint32_t s, uint32_t low, uint32_t high,
|
|
KeySet&& existing_uniq, KeySet&& non_existing_uniq)
|
|
: rand_(s),
|
|
low_(low),
|
|
high_(high),
|
|
existing_uniq_(std::move(existing_uniq)),
|
|
non_existing_uniq_(std::move(non_existing_uniq)) {}
|
|
~KeyGenerator() {
|
|
assert(!existing_uniq_.empty());
|
|
assert(!non_existing_uniq_.empty());
|
|
}
|
|
void FinishInit();
|
|
|
|
std::pair<uint32_t, uint32_t> ChooseExisting();
|
|
void Replace(uint32_t old_val, uint32_t old_pos, uint32_t new_val);
|
|
uint32_t Allocate();
|
|
void UndoAllocation(uint32_t new_val);
|
|
|
|
std::string ToString() const {
|
|
std::ostringstream oss;
|
|
oss << "[" << low_ << ", " << high_ << "): " << existing_.size()
|
|
<< " elements, " << existing_uniq_.size() << " unique values, "
|
|
<< non_existing_uniq_.size() << " unique non-existing values";
|
|
return oss.str();
|
|
}
|
|
|
|
private:
|
|
Random rand_;
|
|
uint32_t low_ = 0;
|
|
uint32_t high_ = 0;
|
|
std::vector<uint32_t> existing_{};
|
|
KeySet existing_uniq_{};
|
|
KeySet non_existing_uniq_{};
|
|
bool initialized_ = false;
|
|
};
|
|
|
|
// Return <a, pos>
|
|
std::pair<uint32_t, uint32_t> ChooseExistingA(ThreadState* thread);
|
|
|
|
uint32_t GenerateNextA(ThreadState* thread);
|
|
|
|
// Return <c, pos>
|
|
std::pair<uint32_t, uint32_t> ChooseExistingC(ThreadState* thread);
|
|
|
|
uint32_t GenerateNextC(ThreadState* thread);
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
// Some applications, e.g. MyRocks writes a KV pair to the database via
|
|
// commit-time-write-batch (ctwb) in additional to the transaction's regular
|
|
// write batch. The key is usually constant representing some system
|
|
// metadata, while the value is monoticailly increasing which represents the
|
|
// actual value of the metadata. Method WriteToCommitTimeWriteBatch()
|
|
// emulates this scenario.
|
|
Status WriteToCommitTimeWriteBatch(Transaction& txn);
|
|
#endif //! ROCKSDB_LITE
|
|
|
|
std::vector<std::unique_ptr<KeyGenerator>> key_gen_for_a_;
|
|
std::vector<std::unique_ptr<KeyGenerator>> key_gen_for_c_;
|
|
|
|
Counter counter_{};
|
|
|
|
private:
|
|
struct KeySpaces {
|
|
uint32_t lb_a = 0;
|
|
uint32_t ub_a = 0;
|
|
uint32_t lb_c = 0;
|
|
uint32_t ub_c = 0;
|
|
|
|
explicit KeySpaces() = default;
|
|
explicit KeySpaces(uint32_t _lb_a, uint32_t _ub_a, uint32_t _lb_c,
|
|
uint32_t _ub_c)
|
|
: lb_a(_lb_a), ub_a(_ub_a), lb_c(_lb_c), ub_c(_ub_c) {}
|
|
|
|
std::string EncodeTo() const;
|
|
bool DecodeFrom(Slice data);
|
|
};
|
|
|
|
void PersistKeySpacesDesc(const std::string& key_spaces_path, uint32_t lb_a,
|
|
uint32_t ub_a, uint32_t lb_c, uint32_t ub_c);
|
|
|
|
KeySpaces ReadKeySpacesDesc(const std::string& key_spaces_path);
|
|
|
|
void PreloadDb(SharedState* shared, int threads, uint32_t lb_a, uint32_t ub_a,
|
|
uint32_t lb_c, uint32_t ub_c);
|
|
|
|
void ScanExistingDb(SharedState* shared, int threads);
|
|
};
|
|
|
|
class InvariantChecker {
|
|
public:
|
|
static_assert(sizeof(MultiOpsTxnsStressTest::Record().a_) == sizeof(uint32_t),
|
|
"MultiOpsTxnsStressTest::Record::a_ must be 4 bytes");
|
|
static_assert(sizeof(MultiOpsTxnsStressTest::Record().b_) == sizeof(uint32_t),
|
|
"MultiOpsTxnsStressTest::Record::b_ must be 4 bytes");
|
|
static_assert(sizeof(MultiOpsTxnsStressTest::Record().c_) == sizeof(uint32_t),
|
|
"MultiOpsTxnsStressTest::Record::c_ must be 4 bytes");
|
|
};
|
|
|
|
class MultiOpsTxnsStressListener : public EventListener {
|
|
public:
|
|
explicit MultiOpsTxnsStressListener(MultiOpsTxnsStressTest* stress_test)
|
|
: stress_test_(stress_test) {
|
|
assert(stress_test_);
|
|
}
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
~MultiOpsTxnsStressListener() override {}
|
|
|
|
void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
|
|
assert(db);
|
|
#ifdef NDEBUG
|
|
(void)db;
|
|
#endif
|
|
assert(info.cf_id == 0);
|
|
stress_test_->VerifyPkSkFast(info.job_id);
|
|
}
|
|
|
|
void OnCompactionCompleted(DB* db, const CompactionJobInfo& info) override {
|
|
assert(db);
|
|
#ifdef NDEBUG
|
|
(void)db;
|
|
#endif
|
|
assert(info.cf_id == 0);
|
|
stress_test_->VerifyPkSkFast(info.job_id);
|
|
}
|
|
#endif //! ROCKSDB_LITE
|
|
|
|
private:
|
|
MultiOpsTxnsStressTest* const stress_test_ = nullptr;
|
|
};
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
#endif // GFLAGS
|