Drop column family from write thread

Summary: If we only ever drop a column family from the (single) write thread, we can be sure that nobody will drop the column family while we're writing (and our mutex is released). This greatly simplifies my patch that gets rid of MakeRoomForWrite().

Test Plan: make check, and also ran the stress test

Reviewers: ljin, sdong

Reviewed By: sdong

Subscribers: leveldb

Differential Revision: https://reviews.facebook.net/D22965
This commit is contained in:
Igor Canadi 2014-09-05 15:20:05 -07:00
parent 8de151bb99
commit 9f1c80b556
4 changed files with 98 additions and 23 deletions

View file

@ -77,20 +77,6 @@ const std::string kDefaultColumnFamilyName("default");
void DumpLeveldbBuildVersion(Logger * log); void DumpLeveldbBuildVersion(Logger * log);
// Information kept for every waiting writer
// The constructor only wires up the condition variable; every other field is
// left for the code that creates the Writer to assign.
struct DBImpl::Writer {
// Status slot for this writer.
Status status;
// Batch to write. BuildBatchGroup() checks this against nullptr before
// adding it to a batch group.
WriteBatch* batch;
bool sync;
bool disableWAL;
// Set to true by BuildBatchGroup() for each writer it walks over.
bool in_batch_group;
bool done;
uint64_t timeout_hint_us;
// Condition variable bound to the mutex passed to the constructor.
port::CondVar cv;
explicit Writer(port::Mutex* mu) : cv(mu) { }
};
struct DBImpl::WriteContext { struct DBImpl::WriteContext {
autovector<SuperVersion*> superversions_to_free_; autovector<SuperVersion*> superversions_to_free_;
autovector<log::Writer*> logs_to_free_; autovector<log::Writer*> logs_to_free_;
@ -3627,6 +3613,14 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
edit.DropColumnFamily(); edit.DropColumnFamily();
edit.SetColumnFamily(cfd->GetID()); edit.SetColumnFamily(cfd->GetID());
Writer w(&mutex_);
w.batch = nullptr;
w.sync = false;
w.disableWAL = false;
w.in_batch_group = false;
w.done = false;
w.timeout_hint_us = kNoTimeOut;
Status s; Status s;
{ {
MutexLock l(&mutex_); MutexLock l(&mutex_);
@ -3634,7 +3628,11 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
s = Status::InvalidArgument("Column family already dropped!\n"); s = Status::InvalidArgument("Column family already dropped!\n");
} }
if (s.ok()) { if (s.ok()) {
// we drop column family from a single write thread
s = BeginWrite(&w, 0);
assert(s.ok() && !w.done); // No timeout and nobody should do our job
s = versions_->LogAndApply(cfd, &edit, &mutex_); s = versions_->LogAndApply(cfd, &edit, &mutex_);
EndWrite(&w, &w, s);
} }
} }
@ -4173,15 +4171,19 @@ void DBImpl::BuildBatchGroup(Writer** last_writer,
break; break;
} }
if (w->batch != nullptr) { if (w->batch == nullptr) {
size += WriteBatchInternal::ByteSize(w->batch); // Do not include those writes with nullptr batch. Those are not writes,
if (size > max_size) { // those are something else. They want to be alone
// Do not make batch too big break;
break;
}
write_batch_group->push_back(w->batch);
} }
size += WriteBatchInternal::ByteSize(w->batch);
if (size > max_size) {
// Do not make batch too big
break;
}
write_batch_group->push_back(w->batch);
w->in_batch_group = true; w->in_batch_group = true;
*last_writer = w; *last_writer = w;
} }

View file

@ -203,6 +203,17 @@ class DBImpl : public DB {
SequenceNumber* sequence); SequenceNumber* sequence);
Status TEST_ReadFirstLine(const std::string& fname, SequenceNumber* sequence); Status TEST_ReadFirstLine(const std::string& fname, SequenceNumber* sequence);
// Test-only hooks that lock / unlock the DB mutex (mutex_) directly.
void TEST_LockMutex();
void TEST_UnlockMutex();
// REQUIRES: mutex locked
// Allocates a Writer with no batch and no timeout, passes it to BeginWrite(),
// and returns it as an opaque handle.
void* TEST_BeginWrite();
// REQUIRES: mutex locked
// pass the pointer that you got from TEST_BeginWrite()
// Calls EndWrite() for that writer with an OK status.
void TEST_EndWrite(void* w);
#endif // NDEBUG #endif // NDEBUG
// Structure to store information for candidate files to delete. // Structure to store information for candidate files to delete.
@ -309,7 +320,7 @@ class DBImpl : public DB {
#endif #endif
friend struct SuperVersion; friend struct SuperVersion;
struct CompactionState; struct CompactionState;
struct Writer;
struct WriteContext; struct WriteContext;
Status NewDB(); Status NewDB();
@ -349,6 +360,20 @@ class DBImpl : public DB {
uint64_t SlowdownAmount(int n, double bottom, double top); uint64_t SlowdownAmount(int n, double bottom, double top);
// Information kept for every waiting writer.
//
// Every plain field has a default member initializer so that a freshly
// constructed Writer never exposes an indeterminate value. Code that creates
// a Writer is still expected to assign the fields it cares about before
// handing the Writer to BeginWrite().
struct Writer {
  // Status slot for this writer.
  Status status;
  // Batch to write. A nullptr batch is never merged into a batch group by
  // BuildBatchGroup().
  WriteBatch* batch = nullptr;
  bool sync = false;
  bool disableWAL = false;
  // Set to true by BuildBatchGroup() for writers it puts into a group.
  bool in_batch_group = false;
  bool done = false;
  // 0 is only a placeholder so the field is never indeterminate; the callers
  // visible in this change always overwrite it (with kNoTimeOut).
  uint64_t timeout_hint_us = 0;
  // Condition variable bound to the mutex passed to the constructor.
  port::CondVar cv;

  explicit Writer(port::Mutex* mu) : cv(mu) {}
};
// Before applying write operation (such as DBImpl::Write, DBImpl::Flush) // Before applying write operation (such as DBImpl::Write, DBImpl::Flush)
// thread should grab the mutex_ and be the first on writers queue. // thread should grab the mutex_ and be the first on writers queue.
// BeginWrite is used for it. // BeginWrite is used for it.

View file

@ -130,5 +130,32 @@ Status DBImpl::TEST_ReadFirstLine(const std::string& fname,
SequenceNumber* sequence) { SequenceNumber* sequence) {
return ReadFirstLine(fname, sequence); return ReadFirstLine(fname, sequence);
} }
// Test-only hook: locks the DB mutex (mutex_) on behalf of the caller.
// The caller is responsible for the matching TEST_UnlockMutex().
void DBImpl::TEST_LockMutex() {
mutex_.Lock();
}
// Test-only hook: unlocks the DB mutex (mutex_), e.g. after a prior
// TEST_LockMutex().
void DBImpl::TEST_UnlockMutex() {
mutex_.Unlock();
}
void* DBImpl::TEST_BeginWrite() {
auto w = new Writer(&mutex_);
w->batch = nullptr;
w->sync = false;
w->disableWAL = false;
w->in_batch_group = false;
w->done = false;
w->timeout_hint_us = kNoTimeOut;
Status s = BeginWrite(w, 0);
assert(s.ok() && !w->done); // No timeout and nobody should do our job
return reinterpret_cast<void*>(w);
}
void DBImpl::TEST_EndWrite(void* w) {
auto writer = reinterpret_cast<Writer*>(w);
EndWrite(writer, writer, Status::OK());
}
} // namespace rocksdb } // namespace rocksdb
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE

View file

@ -11,6 +11,7 @@
#include <iostream> #include <iostream>
#include <set> #include <set>
#include <unistd.h> #include <unistd.h>
#include <thread>
#include <unordered_set> #include <unordered_set>
#include <utility> #include <utility>
@ -7894,6 +7895,26 @@ TEST(DBTest, DBIteratorBoundTest) {
} }
} }
// Exercises the TEST_BeginWrite()/TEST_EndWrite() hooks together with
// concurrent write operations: a test-owned writer is started first, a Put
// and a Flush are launched on other threads while it is still outstanding,
// and then the test writer is ended. There are no explicit assertions; the
// test completes once both threads have been joined.
TEST(DBTest, WriteSingleThreadEntry) {
std::vector<std::thread> threads;
// TEST_BeginWrite() requires the DB mutex to be held.
dbfull()->TEST_LockMutex();
auto w = dbfull()->TEST_BeginWrite();
// Launched while this thread still holds the mutex.
// NOTE(review): presumably the Put has to wait behind w -- confirm against
// BeginWrite().
threads.emplace_back([&] { Put("a", "b"); });
// NOTE(review): the sleeps presumably give each thread time to start before
// the next step; the ordering is not otherwise enforced.
env_->SleepForMicroseconds(10000);
threads.emplace_back([&] { Flush(); });
env_->SleepForMicroseconds(10000);
// Drop the mutex and take it again before ending the test writer.
// NOTE(review): presumably this lets the two threads run -- confirm.
dbfull()->TEST_UnlockMutex();
dbfull()->TEST_LockMutex();
// TEST_EndWrite() also requires the mutex; w is the handle from above.
dbfull()->TEST_EndWrite(w);
dbfull()->TEST_UnlockMutex();
// Wait for the Put and the Flush to finish.
for (auto& t : threads) {
t.join();
}
}
} // namespace rocksdb } // namespace rocksdb
int main(int argc, char** argv) { int main(int argc, char** argv) {