mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-27 02:44:18 +00:00
9d77bf8f7b
Summary: - Right now each read fragments the memtable range tombstones https://github.com/facebook/rocksdb/issues/4808. This PR explores the idea of fragmenting memtable range tombstones in the write path and reads can just read this cached fragmented tombstone without any fragmenting cost. This PR only does the caching for immutable memtable, and does so right before a memtable is added to an immutable memtable list. The fragmentation is done without holding mutex to minimize its performance impact. - db_bench is updated to print out the number of range deletions executed if there is any. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10380 Test Plan: - CI, added asserts in various places to check whether a fragmented range tombstone list should have been constructed. - Benchmark: as this PR only optimizes immutable memtable path, the number of writes in the benchmark is chosen such an immutable memtable is created and range tombstones are in that memtable. ``` single thread: ./db_bench --benchmarks=fillrandom,readrandom --writes_per_range_tombstone=1 --max_write_buffer_number=100 --min_write_buffer_number_to_merge=100 --writes=500000 --reads=100000 --max_num_range_tombstones=100 multi_thread ./db_bench --benchmarks=fillrandom,readrandom --writes_per_range_tombstone=1 --max_write_buffer_number=100 --min_write_buffer_number_to_merge=100 --writes=15000 --reads=20000 --threads=32 --max_num_range_tombstones=100 ``` Commit99cdf16464
is included in benchmark result. It was an earlier attempt where tombstones are fragmented for each write operation. Reader threads share it using a shared_ptr which would slow down multi-thread read performance as seen in benchmark results. Results are averaged over 5 runs. Single thread result: | Max # tombstones | main fillrandom micros/op |99cdf16464
| Post PR | main readrandom micros/op |99cdf16464
| Post PR | | ------------- | ------------- |------------- |------------- |------------- |------------- |------------- | | 0 |6.68 |6.57 |6.72 |4.72 |4.79 |4.54 | | 1 |6.67 |6.58 |6.62 |5.41 |4.74 |4.72 | | 10 |6.59 |6.5 |6.56 |7.83 |4.69 |4.59 | | 100 |6.62 |6.75 |6.58 |29.57 |5.04 |5.09 | | 1000 |6.54 |6.82 |6.61 |320.33 |5.22 |5.21 | 32-thread result: note that "Max # tombstones" is per thread. | Max # tombstones | main fillrandom micros/op |99cdf16464
| Post PR | main readrandom micros/op |99cdf16464
| Post PR | | ------------- | ------------- |------------- |------------- |------------- |------------- |------------- | | 0 |234.52 |260.25 |239.42 |5.06 |5.38 |5.09 | | 1 |236.46 |262.0 |231.1 |19.57 |22.14 |5.45 | | 10 |236.95 |263.84 |251.49 |151.73 |21.61 |5.73 | | 100 |268.16 |296.8 |280.13 |2308.52 |22.27 |6.57 | Reviewed By: ajkr Differential Revision: D37916564 Pulled By: cbi42 fbshipit-source-id: 05d6d2e16df26c374c57ddcca13a5bfe9d5b731e
345 lines
12 KiB
C++
345 lines
12 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#include <memory>
|
|
#include <string>
|
|
|
|
#include "db/db_test_util.h"
|
|
#include "db/memtable.h"
|
|
#include "db/range_del_aggregator.h"
|
|
#include "port/stack_trace.h"
|
|
#include "rocksdb/memtablerep.h"
|
|
#include "rocksdb/slice_transform.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
class DBMemTableTest : public DBTestBase {
|
|
public:
|
|
DBMemTableTest() : DBTestBase("db_memtable_test", /*env_do_fsync=*/true) {}
|
|
};
|
|
|
|
class MockMemTableRep : public MemTableRep {
|
|
public:
|
|
explicit MockMemTableRep(Allocator* allocator, MemTableRep* rep)
|
|
: MemTableRep(allocator), rep_(rep), num_insert_with_hint_(0) {}
|
|
|
|
KeyHandle Allocate(const size_t len, char** buf) override {
|
|
return rep_->Allocate(len, buf);
|
|
}
|
|
|
|
void Insert(KeyHandle handle) override { rep_->Insert(handle); }
|
|
|
|
void InsertWithHint(KeyHandle handle, void** hint) override {
|
|
num_insert_with_hint_++;
|
|
EXPECT_NE(nullptr, hint);
|
|
last_hint_in_ = *hint;
|
|
rep_->InsertWithHint(handle, hint);
|
|
last_hint_out_ = *hint;
|
|
}
|
|
|
|
bool Contains(const char* key) const override { return rep_->Contains(key); }
|
|
|
|
void Get(const LookupKey& k, void* callback_args,
|
|
bool (*callback_func)(void* arg, const char* entry)) override {
|
|
rep_->Get(k, callback_args, callback_func);
|
|
}
|
|
|
|
size_t ApproximateMemoryUsage() override {
|
|
return rep_->ApproximateMemoryUsage();
|
|
}
|
|
|
|
Iterator* GetIterator(Arena* arena) override {
|
|
return rep_->GetIterator(arena);
|
|
}
|
|
|
|
void* last_hint_in() { return last_hint_in_; }
|
|
void* last_hint_out() { return last_hint_out_; }
|
|
int num_insert_with_hint() { return num_insert_with_hint_; }
|
|
|
|
private:
|
|
std::unique_ptr<MemTableRep> rep_;
|
|
void* last_hint_in_;
|
|
void* last_hint_out_;
|
|
int num_insert_with_hint_;
|
|
};
|
|
|
|
class MockMemTableRepFactory : public MemTableRepFactory {
|
|
public:
|
|
MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& cmp,
|
|
Allocator* allocator,
|
|
const SliceTransform* transform,
|
|
Logger* logger) override {
|
|
SkipListFactory factory;
|
|
MemTableRep* skiplist_rep =
|
|
factory.CreateMemTableRep(cmp, allocator, transform, logger);
|
|
mock_rep_ = new MockMemTableRep(allocator, skiplist_rep);
|
|
return mock_rep_;
|
|
}
|
|
|
|
MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& cmp,
|
|
Allocator* allocator,
|
|
const SliceTransform* transform,
|
|
Logger* logger,
|
|
uint32_t column_family_id) override {
|
|
last_column_family_id_ = column_family_id;
|
|
return CreateMemTableRep(cmp, allocator, transform, logger);
|
|
}
|
|
|
|
const char* Name() const override { return "MockMemTableRepFactory"; }
|
|
|
|
MockMemTableRep* rep() { return mock_rep_; }
|
|
|
|
bool IsInsertConcurrentlySupported() const override { return false; }
|
|
|
|
uint32_t GetLastColumnFamilyId() { return last_column_family_id_; }
|
|
|
|
private:
|
|
MockMemTableRep* mock_rep_;
|
|
// workaround since there's no std::numeric_limits<uint32_t>::max() yet.
|
|
uint32_t last_column_family_id_ = static_cast<uint32_t>(-1);
|
|
};
|
|
|
|
class TestPrefixExtractor : public SliceTransform {
|
|
public:
|
|
const char* Name() const override { return "TestPrefixExtractor"; }
|
|
|
|
Slice Transform(const Slice& key) const override {
|
|
const char* p = separator(key);
|
|
if (p == nullptr) {
|
|
return Slice();
|
|
}
|
|
return Slice(key.data(), p - key.data() + 1);
|
|
}
|
|
|
|
bool InDomain(const Slice& key) const override {
|
|
return separator(key) != nullptr;
|
|
}
|
|
|
|
bool InRange(const Slice& /*key*/) const override { return false; }
|
|
|
|
private:
|
|
const char* separator(const Slice& key) const {
|
|
return reinterpret_cast<const char*>(memchr(key.data(), '_', key.size()));
|
|
}
|
|
};
|
|
|
|
// Test that ::Add properly returns false when inserting duplicate keys
|
|
TEST_F(DBMemTableTest, DuplicateSeq) {
|
|
SequenceNumber seq = 123;
|
|
std::string value;
|
|
MergeContext merge_context;
|
|
Options options;
|
|
InternalKeyComparator ikey_cmp(options.comparator);
|
|
ReadRangeDelAggregator range_del_agg(&ikey_cmp,
|
|
kMaxSequenceNumber /* upper_bound */);
|
|
|
|
// Create a MemTable
|
|
InternalKeyComparator cmp(BytewiseComparator());
|
|
auto factory = std::make_shared<SkipListFactory>();
|
|
options.memtable_factory = factory;
|
|
ImmutableOptions ioptions(options);
|
|
WriteBufferManager wb(options.db_write_buffer_size);
|
|
MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
|
|
kMaxSequenceNumber, 0 /* column_family_id */);
|
|
|
|
// Write some keys and make sure it returns false on duplicates
|
|
ASSERT_OK(
|
|
mem->Add(seq, kTypeValue, "key", "value2", nullptr /* kv_prot_info */));
|
|
ASSERT_TRUE(
|
|
mem->Add(seq, kTypeValue, "key", "value2", nullptr /* kv_prot_info */)
|
|
.IsTryAgain());
|
|
// Changing the type should still cause the duplicatae key
|
|
ASSERT_TRUE(
|
|
mem->Add(seq, kTypeMerge, "key", "value2", nullptr /* kv_prot_info */)
|
|
.IsTryAgain());
|
|
// Changing the seq number will make the key fresh
|
|
ASSERT_OK(mem->Add(seq + 1, kTypeMerge, "key", "value2",
|
|
nullptr /* kv_prot_info */));
|
|
// Test with different types for duplicate keys
|
|
ASSERT_TRUE(
|
|
mem->Add(seq, kTypeDeletion, "key", "", nullptr /* kv_prot_info */)
|
|
.IsTryAgain());
|
|
ASSERT_TRUE(
|
|
mem->Add(seq, kTypeSingleDeletion, "key", "", nullptr /* kv_prot_info */)
|
|
.IsTryAgain());
|
|
|
|
// Test the duplicate keys under stress
|
|
for (int i = 0; i < 10000; i++) {
|
|
bool insert_dup = i % 10 == 1;
|
|
if (!insert_dup) {
|
|
seq++;
|
|
}
|
|
Status s = mem->Add(seq, kTypeValue, "foo", "value" + std::to_string(seq),
|
|
nullptr /* kv_prot_info */);
|
|
if (insert_dup) {
|
|
ASSERT_TRUE(s.IsTryAgain());
|
|
} else {
|
|
ASSERT_OK(s);
|
|
}
|
|
}
|
|
delete mem;
|
|
|
|
// Test with InsertWithHint
|
|
options.memtable_insert_with_hint_prefix_extractor.reset(
|
|
new TestPrefixExtractor()); // which uses _ to extract the prefix
|
|
ioptions = ImmutableOptions(options);
|
|
mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
|
|
kMaxSequenceNumber, 0 /* column_family_id */);
|
|
// Insert a duplicate key with _ in it
|
|
ASSERT_OK(
|
|
mem->Add(seq, kTypeValue, "key_1", "value", nullptr /* kv_prot_info */));
|
|
ASSERT_TRUE(
|
|
mem->Add(seq, kTypeValue, "key_1", "value", nullptr /* kv_prot_info */)
|
|
.IsTryAgain());
|
|
delete mem;
|
|
|
|
// Test when InsertConcurrently will be invoked
|
|
options.allow_concurrent_memtable_write = true;
|
|
ioptions = ImmutableOptions(options);
|
|
mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
|
|
kMaxSequenceNumber, 0 /* column_family_id */);
|
|
MemTablePostProcessInfo post_process_info;
|
|
ASSERT_OK(mem->Add(seq, kTypeValue, "key", "value",
|
|
nullptr /* kv_prot_info */, true, &post_process_info));
|
|
ASSERT_TRUE(mem->Add(seq, kTypeValue, "key", "value",
|
|
nullptr /* kv_prot_info */, true, &post_process_info)
|
|
.IsTryAgain());
|
|
delete mem;
|
|
}
|
|
|
|
// A simple test to verify that the concurrent merge writes is functional
|
|
TEST_F(DBMemTableTest, ConcurrentMergeWrite) {
|
|
int num_ops = 1000;
|
|
std::string value;
|
|
MergeContext merge_context;
|
|
Options options;
|
|
// A merge operator that is not sensitive to concurrent writes since in this
|
|
// test we don't order the writes.
|
|
options.merge_operator = MergeOperators::CreateUInt64AddOperator();
|
|
|
|
// Create a MemTable
|
|
InternalKeyComparator cmp(BytewiseComparator());
|
|
auto factory = std::make_shared<SkipListFactory>();
|
|
options.memtable_factory = factory;
|
|
options.allow_concurrent_memtable_write = true;
|
|
ImmutableOptions ioptions(options);
|
|
WriteBufferManager wb(options.db_write_buffer_size);
|
|
MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
|
|
kMaxSequenceNumber, 0 /* column_family_id */);
|
|
|
|
// Put 0 as the base
|
|
PutFixed64(&value, static_cast<uint64_t>(0));
|
|
ASSERT_OK(mem->Add(0, kTypeValue, "key", value, nullptr /* kv_prot_info */));
|
|
value.clear();
|
|
|
|
// Write Merge concurrently
|
|
ROCKSDB_NAMESPACE::port::Thread write_thread1([&]() {
|
|
MemTablePostProcessInfo post_process_info1;
|
|
std::string v1;
|
|
for (int seq = 1; seq < num_ops / 2; seq++) {
|
|
PutFixed64(&v1, seq);
|
|
ASSERT_OK(mem->Add(seq, kTypeMerge, "key", v1, nullptr /* kv_prot_info */,
|
|
true, &post_process_info1));
|
|
v1.clear();
|
|
}
|
|
});
|
|
ROCKSDB_NAMESPACE::port::Thread write_thread2([&]() {
|
|
MemTablePostProcessInfo post_process_info2;
|
|
std::string v2;
|
|
for (int seq = num_ops / 2; seq < num_ops; seq++) {
|
|
PutFixed64(&v2, seq);
|
|
ASSERT_OK(mem->Add(seq, kTypeMerge, "key", v2, nullptr /* kv_prot_info */,
|
|
true, &post_process_info2));
|
|
v2.clear();
|
|
}
|
|
});
|
|
write_thread1.join();
|
|
write_thread2.join();
|
|
|
|
Status status;
|
|
ReadOptions roptions;
|
|
SequenceNumber max_covering_tombstone_seq = 0;
|
|
LookupKey lkey("key", kMaxSequenceNumber);
|
|
bool res = mem->Get(lkey, &value, /*timestamp=*/nullptr, &status,
|
|
&merge_context, &max_covering_tombstone_seq, roptions,
|
|
false /* immutable_memtable */);
|
|
ASSERT_OK(status);
|
|
ASSERT_TRUE(res);
|
|
uint64_t ivalue = DecodeFixed64(Slice(value).data());
|
|
uint64_t sum = 0;
|
|
for (int seq = 0; seq < num_ops; seq++) {
|
|
sum += seq;
|
|
}
|
|
ASSERT_EQ(ivalue, sum);
|
|
|
|
delete mem;
|
|
}
|
|
|
|
TEST_F(DBMemTableTest, InsertWithHint) {
|
|
Options options;
|
|
options.allow_concurrent_memtable_write = false;
|
|
options.create_if_missing = true;
|
|
options.memtable_factory.reset(new MockMemTableRepFactory());
|
|
options.memtable_insert_with_hint_prefix_extractor.reset(
|
|
new TestPrefixExtractor());
|
|
options.env = env_;
|
|
Reopen(options);
|
|
MockMemTableRep* rep =
|
|
reinterpret_cast<MockMemTableRepFactory*>(options.memtable_factory.get())
|
|
->rep();
|
|
ASSERT_OK(Put("foo_k1", "foo_v1"));
|
|
ASSERT_EQ(nullptr, rep->last_hint_in());
|
|
void* hint_foo = rep->last_hint_out();
|
|
ASSERT_OK(Put("foo_k2", "foo_v2"));
|
|
ASSERT_EQ(hint_foo, rep->last_hint_in());
|
|
ASSERT_EQ(hint_foo, rep->last_hint_out());
|
|
ASSERT_OK(Put("foo_k3", "foo_v3"));
|
|
ASSERT_EQ(hint_foo, rep->last_hint_in());
|
|
ASSERT_EQ(hint_foo, rep->last_hint_out());
|
|
ASSERT_OK(Put("bar_k1", "bar_v1"));
|
|
ASSERT_EQ(nullptr, rep->last_hint_in());
|
|
void* hint_bar = rep->last_hint_out();
|
|
ASSERT_NE(hint_foo, hint_bar);
|
|
ASSERT_OK(Put("bar_k2", "bar_v2"));
|
|
ASSERT_EQ(hint_bar, rep->last_hint_in());
|
|
ASSERT_EQ(hint_bar, rep->last_hint_out());
|
|
ASSERT_EQ(5, rep->num_insert_with_hint());
|
|
ASSERT_OK(Put("NotInPrefixDomain", "vvv"));
|
|
ASSERT_EQ(5, rep->num_insert_with_hint());
|
|
ASSERT_EQ("foo_v1", Get("foo_k1"));
|
|
ASSERT_EQ("foo_v2", Get("foo_k2"));
|
|
ASSERT_EQ("foo_v3", Get("foo_k3"));
|
|
ASSERT_EQ("bar_v1", Get("bar_k1"));
|
|
ASSERT_EQ("bar_v2", Get("bar_k2"));
|
|
ASSERT_EQ("vvv", Get("NotInPrefixDomain"));
|
|
}
|
|
|
|
TEST_F(DBMemTableTest, ColumnFamilyId) {
|
|
// Verifies MemTableRepFactory is told the right column family id.
|
|
Options options;
|
|
options.env = CurrentOptions().env;
|
|
options.allow_concurrent_memtable_write = false;
|
|
options.create_if_missing = true;
|
|
options.memtable_factory.reset(new MockMemTableRepFactory());
|
|
DestroyAndReopen(options);
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
for (uint32_t cf = 0; cf < 2; ++cf) {
|
|
ASSERT_OK(Put(cf, "key", "val"));
|
|
ASSERT_OK(Flush(cf));
|
|
ASSERT_EQ(
|
|
cf, static_cast<MockMemTableRepFactory*>(options.memtable_factory.get())
|
|
->GetLastColumnFamilyId());
|
|
}
|
|
}
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
|
|
int main(int argc, char** argv) {
|
|
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
|
|
::testing::InitGoogleTest(&argc, argv);
|
|
return RUN_ALL_TESTS();
|
|
}
|