Better locking in vectorrep that increases throughput to match speed of storage.

Summary:
There is a use-case where we want to insert data into rocksdb as
fast as possible. Vector rep is used for this purpose.

The background flush thread needs to flush the vectorrep to
storage. It acquires the dblock then sorts the vector, releases
the dblock and then writes the sorted vector to storage. This is
suboptimal because the lock is held during the sort, which
prevents new writes from occurring.

This patch moves the sorting of the vector rep to outside the
db mutex. Performance is now as fast as the underlying storage
system. If you are doing buffered writes to rocksdb files, then
you can observe throughput upwards of 200 MB/sec writes.

This is an early draft and not yet ready to be reviewed.

Test Plan:
make check

Task ID: #

Blame Rev:

Reviewers: haobo

Reviewed By: haobo

CC: leveldb, haobo

Differential Revision: https://reviews.facebook.net/D12987
This commit is contained in:
Dhruba Borthakur 2013-09-17 14:11:04 -07:00
parent 433541823c
commit 5e9f3a9aa7
2 changed files with 102 additions and 20 deletions

View file

@ -13,6 +13,7 @@
#include "rocksdb/cache.h" #include "rocksdb/cache.h"
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/write_batch.h" #include "rocksdb/write_batch.h"
#include "rocksdb/statistics.h" #include "rocksdb/statistics.h"
#include "port/port.h" #include "port/port.h"
@ -79,6 +80,7 @@ static const char* FLAGS_benchmarks =
"snappycomp," "snappycomp,"
"snappyuncomp," "snappyuncomp,"
"acquireload," "acquireload,"
"fillfromstdin,"
; ;
// the maximum size of key in bytes // the maximum size of key in bytes
static const int MAX_KEY_SIZE = 128; static const int MAX_KEY_SIZE = 128;
@ -906,6 +908,9 @@ class Benchmark {
} else if (name == Slice("fillrandom")) { } else if (name == Slice("fillrandom")) {
fresh_db = true; fresh_db = true;
method = &Benchmark::WriteRandom; method = &Benchmark::WriteRandom;
} else if (name == Slice("fillfromstdin")) {
fresh_db = true;
method = &Benchmark::WriteFromStdin;
} else if (name == Slice("filluniquerandom")) { } else if (name == Slice("filluniquerandom")) {
fresh_db = true; fresh_db = true;
if (num_threads > 1) { if (num_threads > 1) {
@ -1342,6 +1347,54 @@ class Benchmark {
DoWrite(thread, UNIQUE_RANDOM); DoWrite(thread, UNIQUE_RANDOM);
} }
// Commit the accumulated batch to the database; on any write failure,
// report the error to stderr and abort the benchmark run.
void writeOrFail(WriteBatch& batch) {
  const Status status = db_->Write(write_options_, &batch);
  if (!status.ok()) {
    fprintf(stderr, "put error: %s\n", status.ToString().c_str());
    exit(1);
  }
}
// Reads TAB-separated key/value pairs from stdin, one per line, and
// writes them to the database in batches of up to batchSize entries.
// Malformed lines (missing TAB or missing trailing newline) are
// reported and skipped.
void WriteFromStdin(ThreadState* thread) {
  size_t count = 0;
  WriteBatch batch;
  const size_t bufferLen = 32 << 20;
  unique_ptr<char[]> line = unique_ptr<char[]>(new char[bufferLen]);
  char* linep = line.get();
  const int batchSize = 100 << 10;
  const char columnSeparator = '\t';
  const char lineSeparator = '\n';
  while (fgets(linep, bufferLen, stdin) != nullptr) {
    ++count;
    // Bound all searches at the NUL terminator written by fgets.
    // Searching the full 32MB buffer would read uninitialized memory
    // beyond the bytes actually read.
    char* lineEnd = std::find(linep, linep + bufferLen, '\0');
    char* tab = std::find(linep, lineEnd, columnSeparator);
    if (tab == lineEnd) {
      // %zu is the correct conversion for size_t (was %ld).
      fprintf(stderr, "[Error] No Key delimiter TAB at line %zu\n", count);
      continue;
    }
    Slice key(linep, tab - linep);
    tab++;
    char* endLine = std::find(tab, lineEnd, lineSeparator);
    if (endLine == lineEnd) {
      fprintf(stderr, "[Error] No ENTER at end of line # %zu\n", count);
      continue;
    }
    Slice value(tab, endLine - tab);
    thread->stats.FinishedSingleOp(db_);
    thread->stats.AddBytes(endLine - linep - 1);
    // Flush a full batch BEFORE adding the current entry. The original
    // wrote and cleared the batch without ever Put-ing the current
    // key/value, silently dropping one record per full batch.
    if (batch.Count() >= batchSize) {
      writeOrFail(batch);
      batch.Clear();
    }
    batch.Put(key, value);
  }
  // Flush any remaining entries.
  if (batch.Count() > 0) {
    writeOrFail(batch);
  }
}
void DoWrite(ThreadState* thread, WriteMode write_mode) { void DoWrite(ThreadState* thread, WriteMode write_mode) {
const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0; const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0;
const int num_ops = writes_ == 0 ? num_ : writes_ ; const int num_ops = writes_ == 0 ? num_ : writes_ ;
@ -2320,8 +2373,8 @@ int main(int argc, char** argv) {
} else { } else {
FLAGS_key_size = n; FLAGS_key_size = n;
} }
} else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) { } else if (sscanf(argv[i], "--write_buffer_size=%lld%c", &ll, &junk) == 1) {
FLAGS_write_buffer_size = n; FLAGS_write_buffer_size = ll;
} else if (sscanf(argv[i], "--max_write_buffer_number=%d%c", &n, &junk) == 1) { } else if (sscanf(argv[i], "--max_write_buffer_number=%d%c", &n, &junk) == 1) {
FLAGS_max_write_buffer_number = n; FLAGS_max_write_buffer_number = n;
} else if (sscanf(argv[i], "--min_write_buffer_number_to_merge=%d%c", } else if (sscanf(argv[i], "--min_write_buffer_number_to_merge=%d%c",

View file

@ -36,11 +36,15 @@ class VectorRep : public MemTableRep {
virtual ~VectorRep() override { } virtual ~VectorRep() override { }
class Iterator : public MemTableRep::Iterator { class Iterator : public MemTableRep::Iterator {
class VectorRep* vrep_;
std::shared_ptr<std::vector<const char*>> bucket_; std::shared_ptr<std::vector<const char*>> bucket_;
typename std::vector<const char*>::const_iterator cit_; typename std::vector<const char*>::const_iterator mutable cit_;
const KeyComparator& compare_; const KeyComparator& compare_;
bool mutable sorted_;
void DoSort() const;
public: public:
explicit Iterator(std::shared_ptr<std::vector<const char*>> bucket, explicit Iterator(class VectorRep* vrep,
std::shared_ptr<std::vector<const char*>> bucket,
const KeyComparator& compare); const KeyComparator& compare);
// Initialize an iterator over the specified collection. // Initialize an iterator over the specified collection.
@ -82,11 +86,12 @@ class VectorRep : public MemTableRep {
virtual std::shared_ptr<MemTableRep::Iterator> GetIterator() override; virtual std::shared_ptr<MemTableRep::Iterator> GetIterator() override;
private: private:
friend class Iterator;
typedef std::vector<const char*> Bucket; typedef std::vector<const char*> Bucket;
std::shared_ptr<Bucket> bucket_; std::shared_ptr<Bucket> bucket_;
mutable port::RWMutex rwlock_; mutable port::RWMutex rwlock_;
bool immutable_ = false; bool immutable_;
bool sorted_ = false; bool sorted_;
const KeyComparator& compare_; const KeyComparator& compare_;
}; };
@ -119,16 +124,42 @@ size_t VectorRep::ApproximateMemoryUsage() {
VectorRep::VectorRep(const KeyComparator& compare, Arena* arena, size_t count) VectorRep::VectorRep(const KeyComparator& compare, Arena* arena, size_t count)
: bucket_(new Bucket(count)), : bucket_(new Bucket(count)),
immutable_(false),
sorted_(false),
compare_(compare) { } compare_(compare) { }
VectorRep::Iterator::Iterator(std::shared_ptr<std::vector<const char*>> bucket, VectorRep::Iterator::Iterator(class VectorRep* vrep,
std::shared_ptr<std::vector<const char*>> bucket,
const KeyComparator& compare) const KeyComparator& compare)
: bucket_(bucket), : vrep_(vrep),
cit_(bucket_->begin()), bucket_(bucket),
compare_(compare) { } cit_(nullptr),
compare_(compare),
sorted_(false) { }
// Lazily sort the bucket on first use of the iterator, so the sort
// happens outside the db mutex. For an iterator over an immutable
// memtable (vrep_ != nullptr) the shared bucket is sorted at most once
// under the rep's write lock; for a snapshot copy (vrep_ == nullptr)
// the private bucket is sorted with no locking needed.
void VectorRep::Iterator::DoSort() const {
  // vrep is non-null means that we are working on an immutable memtable
  if (!sorted_ && vrep_ != nullptr) {
    WriteLock l(&vrep_->rwlock_);
    if (!vrep_->sorted_) {
      std::sort(bucket_->begin(), bucket_->end(), Compare(compare_));
      vrep_->sorted_ = true;
    }
    // Position the cursor unconditionally: the original only set cit_
    // inside the branch above, leaving it null when another iterator
    // had already sorted the shared bucket.
    cit_ = bucket_->begin();
    sorted_ = true;
  }
  if (!sorted_) {
    std::sort(bucket_->begin(), bucket_->end(), Compare(compare_));
    cit_ = bucket_->begin();
    sorted_ = true;
  }
  assert(sorted_);
  assert(vrep_ == nullptr || vrep_->sorted_);
}
// Returns true iff the iterator is positioned at a valid node. // Returns true iff the iterator is positioned at a valid node.
bool VectorRep::Iterator::Valid() const { bool VectorRep::Iterator::Valid() const {
DoSort();
return cit_ != bucket_->end(); return cit_ != bucket_->end();
} }
@ -165,6 +196,7 @@ void VectorRep::Iterator::Prev() {
// Advance to the first entry with a key >= target // Advance to the first entry with a key >= target
void VectorRep::Iterator::Seek(const char* target) { void VectorRep::Iterator::Seek(const char* target) {
DoSort();
// Do binary search to find first value not less than the target // Do binary search to find first value not less than the target
cit_ = std::equal_range(bucket_->begin(), cit_ = std::equal_range(bucket_->begin(),
bucket_->end(), bucket_->end(),
@ -177,12 +209,14 @@ void VectorRep::Iterator::Seek(const char* target) {
// Position at the first entry in collection. // Position at the first entry in collection.
// Final state of iterator is Valid() iff collection is not empty. // Final state of iterator is Valid() iff collection is not empty.
void VectorRep::Iterator::SeekToFirst() { void VectorRep::Iterator::SeekToFirst() {
DoSort();
cit_ = bucket_->begin(); cit_ = bucket_->begin();
} }
// Position at the last entry in collection. // Position at the last entry in collection.
// Final state of iterator is Valid() iff collection is not empty. // Final state of iterator is Valid() iff collection is not empty.
void VectorRep::Iterator::SeekToLast() { void VectorRep::Iterator::SeekToLast() {
DoSort();
cit_ = bucket_->end(); cit_ = bucket_->end();
if (bucket_->size() != 0) { if (bucket_->size() != 0) {
--cit_; --cit_;
@ -190,21 +224,16 @@ void VectorRep::Iterator::SeekToLast() {
} }
std::shared_ptr<MemTableRep::Iterator> VectorRep::GetIterator() { std::shared_ptr<MemTableRep::Iterator> VectorRep::GetIterator() {
std::shared_ptr<Bucket> tmp;
ReadLock l(&rwlock_); ReadLock l(&rwlock_);
// Do not sort here. The sorting would be done the first time
// a Seek is performed on the iterator.
if (immutable_) { if (immutable_) {
rwlock_.Unlock(); return std::make_shared<Iterator>(this, bucket_, compare_);
rwlock_.WriteLock();
tmp = bucket_;
if (!sorted_) {
std::sort(tmp->begin(), tmp->end(), Compare(compare_));
sorted_ = true;
}
} else { } else {
std::shared_ptr<Bucket> tmp;
tmp.reset(new Bucket(*bucket_)); // make a copy tmp.reset(new Bucket(*bucket_)); // make a copy
std::sort(tmp->begin(), tmp->end(), Compare(compare_)); return std::make_shared<Iterator>(nullptr, tmp, compare_);
} }
return std::make_shared<Iterator>(tmp, compare_);
} }
} // anon namespace } // anon namespace