Enable flushing memtables from arbitrary column families

Summary: Removed default_cfd_ from all flush code paths. This means we can now flush memtables from arbitrary column families!

Test Plan: Added a new unit test

Reviewers: dhruba, haobo

CC: leveldb

Differential Revision: https://reviews.facebook.net/D15789
This commit is contained in:
Igor Canadi 2014-01-30 17:48:42 -08:00
parent 9ca638a86d
commit 3615f534d1
5 changed files with 162 additions and 82 deletions

View File

@ -66,6 +66,9 @@ class ColumnFamilyTest {
Status Merge(int cf, const string& key, const string& value) { Status Merge(int cf, const string& key, const string& value) {
return db_->Merge(WriteOptions(), handles_[cf], Slice(key), Slice(value)); return db_->Merge(WriteOptions(), handles_[cf], Slice(key), Slice(value));
} }
// Flushes the column family whose handle is stored at handles_[cf] and
// returns the Status reported by DB::Flush.
Status Flush(int cf) {
  const FlushOptions flush_options;
  return db_->Flush(flush_options, handles_[cf]);
}
string Get(int cf, const string& key) { string Get(int cf, const string& key) {
ReadOptions options; ReadOptions options;
@ -238,6 +241,40 @@ TEST(ColumnFamilyTest, IgnoreRecoveredLog) {
} }
} }
// Writes keys to three column families, flushes each of them, and then
// checks that every key is readable from its own column family (and absent
// from the others) across several close/reopen cycles.
TEST(ColumnFamilyTest, FlushTest) {
  ASSERT_OK(Open({"default"}));
  CreateColumnFamilies({"one", "two"});
  Close();
  ASSERT_OK(Open({"default", "one", "two"}));
  ASSERT_OK(Put(0, "foo", "v1"));
  ASSERT_OK(Put(0, "bar", "v2"));
  ASSERT_OK(Put(1, "mirko", "v3"));
  ASSERT_OK(Put(0, "foo", "v2"));
  ASSERT_OK(Put(2, "fodor", "v5"));
  for (int i = 0; i < 3; ++i) {
    // Check the returned Status: a failed flush must fail the test instead
    // of being silently dropped.
    ASSERT_OK(Flush(i));
  }
  Close();
  ASSERT_OK(Open({"default", "one", "two"}));

  for (int iter = 0; iter <= 2; ++iter) {
    ASSERT_EQ("v2", Get(0, "foo"));
    ASSERT_EQ("v2", Get(0, "bar"));
    ASSERT_EQ("v3", Get(1, "mirko"));
    ASSERT_EQ("v5", Get(2, "fodor"));
    // Keys must not leak into column families they were not written to.
    ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
    ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
    ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
    if (iter <= 1) {
      // reopen, so the same checks run again against a freshly opened DB
      Close();
      ASSERT_OK(Open({"default", "one", "two"}));
    }
  }
  Close();
}
} // namespace rocksdb } // namespace rocksdb
int main(int argc, char** argv) { int main(int argc, char** argv) {

View File

@ -317,8 +317,12 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
DBImpl::~DBImpl() { DBImpl::~DBImpl() {
// Wait for background work to finish // Wait for background work to finish
if (flush_on_destroy_ && default_cfd_->mem()->GetFirstSequenceNumber() != 0) { if (flush_on_destroy_) {
FlushMemTable(FlushOptions()); for (auto cfd : *versions_->GetColumnFamilySet()) {
if (cfd->mem()->GetFirstSequenceNumber() != 0) {
FlushMemTable(cfd, FlushOptions());
}
}
} }
mutex_.Lock(); mutex_.Lock();
shutting_down_.Release_Store(this); // Any non-nullptr value is ok shutting_down_.Release_Store(this); // Any non-nullptr value is ok
@ -979,6 +983,9 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
for (auto cfd : *versions_->GetColumnFamilySet()) { for (auto cfd : *versions_->GetColumnFamilySet()) {
if (cfd->mem()->ApproximateMemoryUsage() > if (cfd->mem()->ApproximateMemoryUsage() >
cfd->options()->write_buffer_size) { cfd->options()->write_buffer_size) {
// If this asserts, it means that ColumnFamilyMemTablesImpl failed in
// filtering updates to already-flushed column families
assert(cfd->GetLogNumber() <= log_number);
auto iter = version_edits.find(cfd->GetID()); auto iter = version_edits.find(cfd->GetID());
assert(iter != version_edits.end()); assert(iter != version_edits.end());
VersionEdit* edit = &iter->second; VersionEdit* edit = &iter->second;
@ -1001,8 +1008,20 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
assert(iter != version_edits.end()); assert(iter != version_edits.end());
VersionEdit* edit = &iter->second; VersionEdit* edit = &iter->second;
// flush the final memtable if (cfd->GetLogNumber() > log_number) {
status = WriteLevel0TableForRecovery(cfd->mem(), edit); // Column family cfd has already flushed the data
// from log_number. Memtable has to be empty because
// we filter the updates based on log_number
// (in ColumnFamilyMemTablesImpl)
assert(cfd->mem()->GetFirstSequenceNumber() == 0);
assert(edit->NumEntries() == 0);
continue;
}
// flush the final memtable (if non-empty)
if (cfd->mem()->GetFirstSequenceNumber() != 0) {
status = WriteLevel0TableForRecovery(cfd->mem(), edit);
}
// we still want to clear the memtable, even if the recovery failed // we still want to clear the memtable, even if the recovery failed
cfd->CreateNewMemtable(); cfd->CreateNewMemtable();
if (!status.ok()) { if (!status.ok()) {
@ -1016,6 +1035,12 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
// Since we already recovered log_number, we want all logs // Since we already recovered log_number, we want all logs
// with numbers `<= log_number` (includes this one) to be ignored // with numbers `<= log_number` (includes this one) to be ignored
edit->SetLogNumber(log_number + 1); edit->SetLogNumber(log_number + 1);
// We must mark the next log number as used, even though it is not
// actually used, because VersionSet assumes that
// VersionSet::next_file_number_ is always strictly greater than any
// log number.
versions_->MarkFileNumberUsed(log_number + 1);
status = versions_->LogAndApply(cfd, edit, &mutex_); status = versions_->LogAndApply(cfd, edit, &mutex_);
if (!status.ok()) { if (!status.ok()) {
return status; return status;
@ -1077,8 +1102,8 @@ Status DBImpl::WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit) {
return s; return s;
} }
Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd,
Status DBImpl::WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit, std::vector<MemTable*>& mems, VersionEdit* edit,
uint64_t* filenumber) { uint64_t* filenumber) {
mutex_.AssertHeld(); mutex_.AssertHeld();
const uint64_t start_micros = env_->NowMicros(); const uint64_t start_micros = env_->NowMicros();
@ -1090,7 +1115,7 @@ Status DBImpl::WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
const SequenceNumber newest_snapshot = snapshots_.GetNewest(); const SequenceNumber newest_snapshot = snapshots_.GetNewest();
const SequenceNumber earliest_seqno_in_memtable = const SequenceNumber earliest_seqno_in_memtable =
mems[0]->GetFirstSequenceNumber(); mems[0]->GetFirstSequenceNumber();
Version* base = default_cfd_->current(); Version* base = cfd->current();
base->Ref(); // it is likely that we do not need this reference base->Ref(); // it is likely that we do not need this reference
Status s; Status s;
{ {
@ -1127,7 +1152,7 @@ Status DBImpl::WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
// re-acquire the most current version // re-acquire the most current version
base = default_cfd_->current(); base = cfd->current();
// There could be multiple threads writing to its own level-0 file. // There could be multiple threads writing to its own level-0 file.
// The pending_outputs cannot be cleared here, otherwise this newly // The pending_outputs cannot be cleared here, otherwise this newly
@ -1149,7 +1174,7 @@ Status DBImpl::WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
// threads could be concurrently producing compacted files for // threads could be concurrently producing compacted files for
// that key range. // that key range.
if (base != nullptr && options_.max_background_compactions <= 1 && if (base != nullptr && options_.max_background_compactions <= 1 &&
options_.compaction_style == kCompactionStyleLevel) { cfd->options()->compaction_style == kCompactionStyleLevel) {
level = base->PickLevelForMemTableOutput(min_user_key, max_user_key); level = base->PickLevelForMemTableOutput(min_user_key, max_user_key);
} }
edit->AddFile(level, meta.number, meta.file_size, edit->AddFile(level, meta.number, meta.file_size,
@ -1165,12 +1190,13 @@ Status DBImpl::WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
return s; return s;
} }
Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress, Status DBImpl::FlushMemTableToOutputFile(ColumnFamilyData* cfd,
bool* madeProgress,
DeletionState& deletion_state) { DeletionState& deletion_state) {
mutex_.AssertHeld(); mutex_.AssertHeld();
assert(default_cfd_->imm()->size() != 0); assert(cfd->imm()->size() != 0);
if (!default_cfd_->imm()->IsFlushPending()) { if (!cfd->imm()->IsFlushPending()) {
Log(options_.info_log, "FlushMemTableToOutputFile already in progress"); Log(options_.info_log, "FlushMemTableToOutputFile already in progress");
return Status::IOError("FlushMemTableToOutputFile already in progress"); return Status::IOError("FlushMemTableToOutputFile already in progress");
} }
@ -1178,7 +1204,7 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress,
// Save the contents of the earliest memtable as a new Table // Save the contents of the earliest memtable as a new Table
uint64_t file_number; uint64_t file_number;
std::vector<MemTable*> mems; std::vector<MemTable*> mems;
default_cfd_->imm()->PickMemtablesToFlush(&mems); cfd->imm()->PickMemtablesToFlush(&mems);
if (mems.empty()) { if (mems.empty()) {
Log(options_.info_log, "Nothing in memstore to flush"); Log(options_.info_log, "Nothing in memstore to flush");
return Status::IOError("Nothing in memstore to flush"); return Status::IOError("Nothing in memstore to flush");
@ -1193,9 +1219,8 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress,
edit->SetPrevLogNumber(0); edit->SetPrevLogNumber(0);
// SetLogNumber(log_num) indicates logs with number smaller than log_num // SetLogNumber(log_num) indicates logs with number smaller than log_num
// will no longer be picked up for recovery. // will no longer be picked up for recovery.
edit->SetLogNumber( edit->SetLogNumber(mems.back()->GetNextLogNumber());
mems.back()->GetNextLogNumber() edit->SetColumnFamily(cfd->GetID());
);
std::vector<uint64_t> logs_to_delete; std::vector<uint64_t> logs_to_delete;
for (auto mem : mems) { for (auto mem : mems) {
@ -1203,7 +1228,7 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress,
} }
// This will release and re-acquire the mutex. // This will release and re-acquire the mutex.
Status s = WriteLevel0Table(mems, edit, &file_number); Status s = WriteLevel0Table(cfd, mems, edit, &file_number);
if (s.ok() && shutting_down_.Acquire_Load()) { if (s.ok() && shutting_down_.Acquire_Load()) {
s = Status::IOError( s = Status::IOError(
@ -1212,13 +1237,13 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress,
} }
// Replace immutable memtable with the generated Table // Replace immutable memtable with the generated Table
s = default_cfd_->imm()->InstallMemtableFlushResults( s = cfd->imm()->InstallMemtableFlushResults(
default_cfd_, mems, versions_.get(), s, &mutex_, options_.info_log.get(), cfd, mems, versions_.get(), s, &mutex_, options_.info_log.get(),
file_number, pending_outputs_, &deletion_state.memtables_to_free, file_number, pending_outputs_, &deletion_state.memtables_to_free,
db_directory_.get()); db_directory_.get());
if (s.ok()) { if (s.ok()) {
InstallSuperVersion(default_cfd_, deletion_state); InstallSuperVersion(cfd, deletion_state);
if (madeProgress) { if (madeProgress) {
*madeProgress = 1; *madeProgress = 1;
} }
@ -1239,7 +1264,7 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress,
Status DBImpl::CompactRange(const ColumnFamilyHandle& column_family, Status DBImpl::CompactRange(const ColumnFamilyHandle& column_family,
const Slice* begin, const Slice* end, const Slice* begin, const Slice* end,
bool reduce_level, int target_level) { bool reduce_level, int target_level) {
Status s = FlushMemTable(FlushOptions()); Status s = FlushMemTable(default_cfd_, FlushOptions());
if (!s.ok()) { if (!s.ok()) {
LogFlush(options_.info_log); LogFlush(options_.info_log);
return s; return s;
@ -1382,8 +1407,12 @@ uint64_t DBImpl::CurrentVersionNumber() const {
Status DBImpl::Flush(const FlushOptions& options, Status DBImpl::Flush(const FlushOptions& options,
const ColumnFamilyHandle& column_family) { const ColumnFamilyHandle& column_family) {
Status status = FlushMemTable(options); mutex_.Lock();
return status; auto cfd = versions_->GetColumnFamilySet()->GetColumnFamily(column_family.id);
mutex_.Unlock();
assert(cfd != nullptr);
return FlushMemTable(cfd, options);
} }
SequenceNumber DBImpl::GetLatestSequenceNumber() const { SequenceNumber DBImpl::GetLatestSequenceNumber() const {
@ -1657,35 +1686,36 @@ Status DBImpl::TEST_CompactRange(int level,
return RunManualCompaction(level, output_level, begin, end); return RunManualCompaction(level, output_level, begin, end);
} }
Status DBImpl::FlushMemTable(const FlushOptions& options) { Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
const FlushOptions& options) {
// nullptr batch means just wait for earlier writes to be done // nullptr batch means just wait for earlier writes to be done
Status s = Write(WriteOptions(), nullptr); Status s = Write(WriteOptions(), nullptr);
if (s.ok() && options.wait) { if (s.ok() && options.wait) {
// Wait until the compaction completes // Wait until the compaction completes
s = WaitForFlushMemTable(); s = WaitForFlushMemTable(cfd);
} }
return s; return s;
} }
Status DBImpl::WaitForFlushMemTable() { Status DBImpl::WaitForFlushMemTable(ColumnFamilyData* cfd) {
Status s; Status s;
// Wait until the compaction completes // Wait until the compaction completes
MutexLock l(&mutex_); MutexLock l(&mutex_);
while (default_cfd_->imm()->size() > 0 && bg_error_.ok()) { while (cfd->imm()->size() > 0 && bg_error_.ok()) {
bg_cv_.Wait(); bg_cv_.Wait();
} }
if (default_cfd_->imm()->size() != 0) { if (!bg_error_.ok()) {
s = bg_error_; s = bg_error_;
} }
return s; return s;
} }
Status DBImpl::TEST_FlushMemTable() { Status DBImpl::TEST_FlushMemTable() {
return FlushMemTable(FlushOptions()); return FlushMemTable(default_cfd_, FlushOptions());
} }
Status DBImpl::TEST_WaitForFlushMemTable() { Status DBImpl::TEST_WaitForFlushMemTable() {
return WaitForFlushMemTable(); return WaitForFlushMemTable(default_cfd_);
} }
Status DBImpl::TEST_WaitForCompact() { Status DBImpl::TEST_WaitForCompact() {
@ -1710,19 +1740,31 @@ void DBImpl::MaybeScheduleFlushOrCompaction() {
} else if (shutting_down_.Acquire_Load()) { } else if (shutting_down_.Acquire_Load()) {
// DB is being deleted; no more background compactions // DB is being deleted; no more background compactions
} else { } else {
bool is_flush_pending = default_cfd_->imm()->IsFlushPending(); bool is_flush_pending = false;
for (auto cfd : *versions_->GetColumnFamilySet()) {
if (cfd->imm()->IsFlushPending()) {
is_flush_pending = true;
}
}
if (is_flush_pending && if (is_flush_pending &&
(bg_flush_scheduled_ < options_.max_background_flushes)) { (bg_flush_scheduled_ < options_.max_background_flushes)) {
// memtable flush needed // memtable flush needed
bg_flush_scheduled_++; bg_flush_scheduled_++;
env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH); env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH);
} }
bool is_compaction_needed = false;
for (auto cfd : *versions_->GetColumnFamilySet()) {
if (cfd->current()->NeedsCompaction()) {
is_compaction_needed = true;
break;
}
}
// Schedule BGWorkCompaction if there's a compaction pending (or a memtable // Schedule BGWorkCompaction if there's a compaction pending (or a memtable
// flush, but the HIGH pool is not enabled). Do it only if // flush, but the HIGH pool is not enabled). Do it only if
// max_background_compactions hasn't been reached and, in case // max_background_compactions hasn't been reached and, in case
// bg_manual_only_ > 0, if it's a manual compaction. // bg_manual_only_ > 0, if it's a manual compaction.
if ((manual_compaction_ || default_cfd_->current()->NeedsCompaction() || if ((manual_compaction_ || is_compaction_needed ||
(is_flush_pending && (options_.max_background_flushes <= 0))) && (is_flush_pending && (options_.max_background_flushes <= 0))) &&
bg_compaction_scheduled_ < options_.max_background_compactions && bg_compaction_scheduled_ < options_.max_background_compactions &&
(!bg_manual_only_ || manual_compaction_)) { (!bg_manual_only_ || manual_compaction_)) {
@ -1744,11 +1786,14 @@ void DBImpl::BGWorkCompaction(void* db) {
Status DBImpl::BackgroundFlush(bool* madeProgress, Status DBImpl::BackgroundFlush(bool* madeProgress,
DeletionState& deletion_state) { DeletionState& deletion_state) {
Status stat; Status stat;
while (stat.ok() && default_cfd_->imm()->IsFlushPending()) { for (auto cfd : *versions_->GetColumnFamilySet()) {
Log(options_.info_log, while (stat.ok() && cfd->imm()->IsFlushPending()) {
"BackgroundCallFlush doing FlushMemTableToOutputFile, flush slots available %d", Log(options_.info_log,
options_.max_background_flushes - bg_flush_scheduled_); "BackgroundCallFlush doing FlushMemTableToOutputFile with column "
stat = FlushMemTableToOutputFile(madeProgress, deletion_state); "family %u, flush slots available %d",
cfd->GetID(), options_.max_background_flushes - bg_flush_scheduled_);
stat = FlushMemTableToOutputFile(cfd, madeProgress, deletion_state);
}
} }
return stat; return stat;
} }
@ -1871,20 +1916,24 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress,
} }
// TODO: remove memtable flush from formal compaction // TODO: remove memtable flush from formal compaction
while (default_cfd_->imm()->IsFlushPending()) { for (auto cfd : *versions_->GetColumnFamilySet()) {
Log(options_.info_log, while (cfd->imm()->IsFlushPending()) {
"BackgroundCompaction doing FlushMemTableToOutputFile, compaction slots " Log(options_.info_log,
"available %d", "BackgroundCompaction doing FlushMemTableToOutputFile with column "
options_.max_background_compactions - bg_compaction_scheduled_); "family %d, compaction slots available %d",
Status stat = FlushMemTableToOutputFile(madeProgress, deletion_state); cfd->GetID(),
if (!stat.ok()) { options_.max_background_compactions - bg_compaction_scheduled_);
if (is_manual) { Status stat =
manual_compaction_->status = stat; FlushMemTableToOutputFile(cfd, madeProgress, deletion_state);
manual_compaction_->done = true; if (!stat.ok()) {
manual_compaction_->in_progress = false; if (is_manual) {
manual_compaction_ = nullptr; manual_compaction_->status = stat;
manual_compaction_->done = true;
manual_compaction_->in_progress = false;
manual_compaction_ = nullptr;
}
return stat;
} }
return stat;
} }
} }
@ -2285,7 +2334,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact,
LogFlush(options_.info_log); LogFlush(options_.info_log);
mutex_.Lock(); mutex_.Lock();
if (default_cfd_->imm()->IsFlushPending()) { if (default_cfd_->imm()->IsFlushPending()) {
FlushMemTableToOutputFile(nullptr, deletion_state); FlushMemTableToOutputFile(default_cfd_, nullptr, deletion_state);
bg_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary bg_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary
} }
mutex_.Unlock(); mutex_.Unlock();

View File

@ -286,7 +286,7 @@ class DBImpl : public DB {
// Flush the in-memory write buffer to storage. Switches to a new // Flush the in-memory write buffer to storage. Switches to a new
// log-file/memtable and writes a new descriptor iff successful. // log-file/memtable and writes a new descriptor iff successful.
Status FlushMemTableToOutputFile(bool* madeProgress, Status FlushMemTableToOutputFile(ColumnFamilyData* cfd, bool* madeProgress,
DeletionState& deletion_state); DeletionState& deletion_state);
Status RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, Status RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
@ -298,8 +298,8 @@ class DBImpl : public DB {
// for the entire period. The second method WriteLevel0Table supports // for the entire period. The second method WriteLevel0Table supports
// concurrent flush memtables to storage. // concurrent flush memtables to storage.
Status WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit); Status WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit);
Status WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit, Status WriteLevel0Table(ColumnFamilyData* cfd, std::vector<MemTable*>& mems,
uint64_t* filenumber); VersionEdit* edit, uint64_t* filenumber);
uint64_t SlowdownAmount(int n, double bottom, double top); uint64_t SlowdownAmount(int n, double bottom, double top);
Status MakeRoomForWrite(ColumnFamilyData* cfd, Status MakeRoomForWrite(ColumnFamilyData* cfd,
@ -308,10 +308,10 @@ class DBImpl : public DB {
autovector<WriteBatch*>* write_batch_group); autovector<WriteBatch*>* write_batch_group);
// Force current memtable contents to be flushed. // Force current memtable contents to be flushed.
Status FlushMemTable(const FlushOptions& options); Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options);
// Wait for memtable flushed // Wait for memtable flushed
Status WaitForFlushMemTable(); Status WaitForFlushMemTable(ColumnFamilyData* cfd);
void MaybeScheduleLogDBDeployStats(); void MaybeScheduleLogDBDeployStats();
static void BGLogDBDeployStats(void* db); static void BGLogDBDeployStats(void* db);

View File

@ -1130,10 +1130,12 @@ struct VersionSet::ManifestWriter {
Status status; Status status;
bool done; bool done;
port::CondVar cv; port::CondVar cv;
ColumnFamilyData* cfd;
VersionEdit* edit; VersionEdit* edit;
explicit ManifestWriter(port::Mutex* mu, VersionEdit* e) : explicit ManifestWriter(port::Mutex* mu, ColumnFamilyData* cfd,
done(false), cv(mu), edit(e) {} VersionEdit* e)
: done(false), cv(mu), cfd(cfd), edit(e) {}
}; };
// A helper class so we can efficiently apply a whole sequence // A helper class so we can efficiently apply a whole sequence
@ -1374,7 +1376,6 @@ VersionSet::VersionSet(const std::string& dbname, const Options* options,
next_file_number_(2), next_file_number_(2),
manifest_file_number_(0), // Filled by Recover() manifest_file_number_(0), // Filled by Recover()
last_sequence_(0), last_sequence_(0),
log_number_(0),
prev_log_number_(0), prev_log_number_(0),
num_levels_(options_->num_levels), num_levels_(options_->num_levels),
current_version_number_(0), current_version_number_(0),
@ -1428,7 +1429,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
mu->AssertHeld(); mu->AssertHeld();
// queue our request // queue our request
ManifestWriter w(mu, edit); ManifestWriter w(mu, column_family_data, edit);
manifest_writers_.push_back(&w); manifest_writers_.push_back(&w);
while (!w.done && &w != manifest_writers_.front()) { while (!w.done && &w != manifest_writers_.front()) {
w.cv.Wait(); w.cv.Wait();
@ -1447,8 +1448,12 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
assert(manifest_writers_.front() == &w); assert(manifest_writers_.front() == &w);
std::deque<ManifestWriter*>::iterator iter = manifest_writers_.begin(); std::deque<ManifestWriter*>::iterator iter = manifest_writers_.begin();
for (; iter != manifest_writers_.end(); ++iter) { for (; iter != manifest_writers_.end(); ++iter) {
if ((*iter)->cfd->GetID() != column_family_data->GetID()) {
// group commits across column families are not yet supported
break;
}
last_writer = *iter; last_writer = *iter;
LogAndApplyHelper(&builder, v, last_writer->edit, mu); LogAndApplyHelper(column_family_data, &builder, v, last_writer->edit, mu);
batch_edits.push_back(last_writer->edit); batch_edits.push_back(last_writer->edit);
} }
builder.SaveTo(v); builder.SaveTo(v);
@ -1564,7 +1569,6 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
if (s.ok()) { if (s.ok()) {
manifest_file_size_ = new_manifest_file_size; manifest_file_size_ = new_manifest_file_size;
AppendVersion(column_family_data, v); AppendVersion(column_family_data, v);
log_number_ = edit->log_number_;
column_family_data->SetLogNumber(edit->log_number_); column_family_data->SetLogNumber(edit->log_number_);
prev_log_number_ = edit->prev_log_number_; prev_log_number_ = edit->prev_log_number_;
@ -1596,15 +1600,16 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
return s; return s;
} }
void VersionSet::LogAndApplyHelper(Builder* builder, Version* v, void VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, Builder* builder,
VersionEdit* edit, port::Mutex* mu) { Version* v, VersionEdit* edit,
port::Mutex* mu) {
mu->AssertHeld(); mu->AssertHeld();
if (edit->has_log_number_) { if (edit->has_log_number_) {
assert(edit->log_number_ >= log_number_); assert(edit->log_number_ >= cfd->GetLogNumber());
assert(edit->log_number_ < next_file_number_); assert(edit->log_number_ < next_file_number_);
} else { } else {
edit->SetLogNumber(log_number_); edit->SetLogNumber(cfd->GetLogNumber());
} }
if (!edit->has_prev_log_number_) { if (!edit->has_prev_log_number_) {
@ -1754,6 +1759,7 @@ Status VersionSet::Recover(
if (edit.has_log_number_) { if (edit.has_log_number_) {
cfd->SetLogNumber(edit.log_number_); cfd->SetLogNumber(edit.log_number_);
have_log_number = true;
} }
// if it is not column family add or column family drop, // if it is not column family add or column family drop,
@ -1764,11 +1770,6 @@ Status VersionSet::Recover(
builder->second->Apply(&edit); builder->second->Apply(&edit);
} }
if (edit.has_log_number_) {
log_number = edit.log_number_;
have_log_number = true;
}
if (edit.has_prev_log_number_) { if (edit.has_prev_log_number_) {
prev_log_number = edit.prev_log_number_; prev_log_number = edit.prev_log_number_;
have_prev_log_number = true; have_prev_log_number = true;
@ -1828,7 +1829,6 @@ Status VersionSet::Recover(
manifest_file_number_ = next_file; manifest_file_number_ = next_file;
next_file_number_ = next_file + 1; next_file_number_ = next_file + 1;
last_sequence_ = last_sequence; last_sequence_ = last_sequence;
log_number_ = log_number;
prev_log_number_ = prev_log_number; prev_log_number_ = prev_log_number;
Log(options_->info_log, "Recovered from manifest file:%s succeeded," Log(options_->info_log, "Recovered from manifest file:%s succeeded,"
@ -1839,7 +1839,7 @@ Status VersionSet::Recover(
(unsigned long)manifest_file_number_, (unsigned long)manifest_file_number_,
(unsigned long)next_file_number_, (unsigned long)next_file_number_,
(unsigned long)last_sequence_, (unsigned long)last_sequence_,
(unsigned long)log_number_, (unsigned long)log_number,
(unsigned long)prev_log_number_); (unsigned long)prev_log_number_);
for (auto cfd : *column_family_set_) { for (auto cfd : *column_family_set_) {
@ -2041,7 +2041,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
} }
if (edit.has_log_number_) { if (edit.has_log_number_) {
log_number = edit.log_number_; log_number = std::max(log_number, edit.log_number_);
have_log_number = true; have_log_number = true;
} }
@ -2090,7 +2090,6 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
manifest_file_number_ = next_file; manifest_file_number_ = next_file;
next_file_number_ = next_file + 1; next_file_number_ = next_file + 1;
last_sequence_ = last_sequence; last_sequence_ = last_sequence;
log_number_ = log_number;
prev_log_number_ = prev_log_number; prev_log_number_ = prev_log_number;
printf("manifest_file_number %lu next_file_number %lu last_sequence " printf("manifest_file_number %lu next_file_number %lu last_sequence "

View File

@ -344,10 +344,6 @@ class VersionSet {
// Mark the specified file number as used. // Mark the specified file number as used.
void MarkFileNumberUsed(uint64_t number); void MarkFileNumberUsed(uint64_t number);
// Return the current log file number. This is the biggest log_number from
// all column families
uint64_t LogNumber() const { return log_number_; }
// Return the log file number for the log file that is currently // Return the log file number for the log file that is currently
// being compacted, or zero if there is no such log file. // being compacted, or zero if there is no such log file.
uint64_t PrevLogNumber() const { return prev_log_number_; } uint64_t PrevLogNumber() const { return prev_log_number_; }
@ -468,7 +464,6 @@ class VersionSet {
uint64_t next_file_number_; uint64_t next_file_number_;
uint64_t manifest_file_number_; uint64_t manifest_file_number_;
std::atomic<uint64_t> last_sequence_; std::atomic<uint64_t> last_sequence_;
uint64_t log_number_;
uint64_t prev_log_number_; // 0 or backing store for memtable being compacted uint64_t prev_log_number_; // 0 or backing store for memtable being compacted
int num_levels_; int num_levels_;
@ -502,8 +497,8 @@ class VersionSet {
VersionSet(const VersionSet&); VersionSet(const VersionSet&);
void operator=(const VersionSet&); void operator=(const VersionSet&);
void LogAndApplyHelper(Builder*b, Version* v, void LogAndApplyHelper(ColumnFamilyData* cfd, Builder* b, Version* v,
VersionEdit* edit, port::Mutex* mu); VersionEdit* edit, port::Mutex* mu);
}; };
} // namespace rocksdb } // namespace rocksdb