Allows Get and MultiGet to read directly from SST files.

Summary:
Add kSstFileTier to ReadTier, which allows Get and MultiGet to
read only directly from SST files and skip mem-tables.

    kSstFileTier = 0x2      // data in SST files.
                          // Note that this ReadTier currently only supports
                          // Get and MultiGet and does not support iterators.

Test Plan: add new test in db_test.

Reviewers: anthony, IslamAbdelRahman, rven, kradhakrishnan, sdong

Reviewed By: sdong

Subscribers: igor, dhruba, leveldb

Differential Revision: https://reviews.facebook.net/D53511
This commit is contained in:
Yueh-Hsuan Chiang 2016-02-09 11:20:22 -08:00
parent fe93bf9b5d
commit 4a8cbf4e31
7 changed files with 183 additions and 21 deletions

View file

@ -3,12 +3,13 @@
## 4.5.0 (2/5/2016)
### Public API Changes
* Add a new perf context level between kEnableCount and kEnableTime. Level 2 now doesn't include timers for mutexes.
* Add a new perf context level between kEnableCount and kEnableTime. Level 2 now does not include timers for mutexes.
* Statistics of mutex operation durations will not be measured by default. If you want to have them enabled, you need to set Statistics::stats_level_ to kAll.
* DBOptions::delete_scheduler and NewDeleteScheduler() are removed, please use DBOptions::sst_file_manager and NewSstFileManager() instead
### New Features
* ldb tool now supports operations to non-default column families.
* Add kPersistedTier to ReadTier. This option allows Get and MultiGet to read only the persited data and skip mem-tables if writes were done with disableWAL = true.
* Add DBOptions::sst_file_manager. Use NewSstFileManager() in include/rocksdb/sst_file_manager.h to create a SstFileManager that can be used to track the total size of SST files and control the SST files deletion rate.
## 4.4.0 (1/14/2016)

View file

@ -275,7 +275,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname)
db_options_.delete_obsolete_files_period_micros),
last_stats_dump_time_microsec_(0),
next_job_id_(1),
flush_on_destroy_(false),
has_unpersisted_data_(false),
env_options_(db_options_),
#ifndef ROCKSDB_LITE
wal_manager_(db_options_, env_options_),
@ -322,7 +322,8 @@ void DBImpl::CancelAllBackgroundWork(bool wait) {
DBImpl::~DBImpl() {
mutex_.Lock();
if (!shutting_down_.load(std::memory_order_acquire) && flush_on_destroy_) {
if (!shutting_down_.load(std::memory_order_acquire) &&
has_unpersisted_data_) {
for (auto cfd : *versions_->GetColumnFamilySet()) {
if (!cfd->IsDropped() && !cfd->mem()->IsEmpty()) {
cfd->Ref();
@ -3306,13 +3307,19 @@ Status DBImpl::GetImpl(const ReadOptions& read_options,
LookupKey lkey(key, snapshot);
PERF_TIMER_STOP(get_snapshot_time);
if (sv->mem->Get(lkey, value, &s, &merge_context)) {
// Done
RecordTick(stats_, MEMTABLE_HIT);
} else if (sv->imm->Get(lkey, value, &s, &merge_context)) {
// Done
RecordTick(stats_, MEMTABLE_HIT);
} else {
bool skip_memtable =
(read_options.read_tier == kPersistedTier && has_unpersisted_data_);
bool done = false;
if (!skip_memtable) {
if (sv->mem->Get(lkey, value, &s, &merge_context)) {
done = true;
RecordTick(stats_, MEMTABLE_HIT);
} else if (sv->imm->Get(lkey, value, &s, &merge_context)) {
done = true;
RecordTick(stats_, MEMTABLE_HIT);
}
}
if (!done) {
PERF_TIMER_GUARD(get_from_output_files_time);
sv->current->Get(read_options, lkey, value, &s, &merge_context,
value_found);
@ -3397,14 +3404,23 @@ std::vector<Status> DBImpl::MultiGet(
assert(mgd_iter != multiget_cf_data.end());
auto mgd = mgd_iter->second;
auto super_version = mgd->super_version;
if (super_version->mem->Get(lkey, value, &s, &merge_context)) {
// Done
} else if (super_version->imm->Get(lkey, value, &s, &merge_context)) {
// Done
} else {
bool skip_memtable =
(read_options.read_tier == kPersistedTier && has_unpersisted_data_);
bool done = false;
if (!skip_memtable) {
if (super_version->mem->Get(lkey, value, &s, &merge_context)) {
done = true;
// TODO(?): RecordTick(stats_, MEMTABLE_HIT)?
} else if (super_version->imm->Get(lkey, value, &s, &merge_context)) {
done = true;
// TODO(?): RecordTick(stats_, MEMTABLE_HIT)?
}
}
if (!done) {
PERF_TIMER_GUARD(get_from_output_files_time);
super_version->current->Get(read_options, lkey, value, &s,
&merge_context);
// TODO(?): RecordTick(stats_, MEMTABLE_MISS)?
}
if (s.ok()) {
@ -3843,6 +3859,10 @@ bool DBImpl::KeyMayExist(const ReadOptions& read_options,
Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
ColumnFamilyHandle* column_family) {
if (read_options.read_tier == kPersistedTier) {
return NewErrorIterator(Status::NotSupported(
"ReadTier::kPersistedData is not yet supported in iterators."));
}
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
auto cfd = cfh->cfd();
@ -3949,6 +3969,10 @@ Status DBImpl::NewIterators(
const ReadOptions& read_options,
const std::vector<ColumnFamilyHandle*>& column_families,
std::vector<Iterator*>* iterators) {
if (read_options.read_tier == kPersistedTier) {
return Status::NotSupported(
"ReadTier::kPersistedData is not yet supported in iterators.");
}
iterators->clear();
iterators->reserve(column_families.size());
XFUNC_TEST("", "managed_new", managed_new1, xf_manage_new,
@ -4327,7 +4351,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
PERF_TIMER_STOP(write_pre_and_post_process_time);
if (write_options.disableWAL) {
flush_on_destroy_ = true;
has_unpersisted_data_ = true;
}
uint64_t log_size = 0;

View file

@ -822,7 +822,10 @@ class DBImpl : public DB {
// they're unique
std::atomic<int> next_job_id_;
bool flush_on_destroy_; // Used when disableWAL is true.
// A flag indicating whether the current rocksdb database has any
// data that is not yet persisted into either WAL or SST file.
// Used when disableWAL is true.
bool has_unpersisted_data_;
static const int KEEP_LOG_FILE_NUM = 1000;
// MSVC version 1800 still does not have constexpr for ::max()

View file

@ -519,6 +519,135 @@ TEST_F(DBTest, PutSingleDeleteGet) {
kSkipUniversalCompaction | kSkipMergePut));
}
TEST_F(DBTest, ReadFromPersistedTier) {
do {
Random rnd(301);
Options options = CurrentOptions();
for (int disableWAL = 0; disableWAL <= 1; ++disableWAL) {
CreateAndReopenWithCF({"pikachu"}, options);
WriteOptions wopt;
wopt.disableWAL = (disableWAL == 1);
// 1st round: put but not flush
ASSERT_OK(db_->Put(wopt, handles_[1], "foo", "first"));
ASSERT_OK(db_->Put(wopt, handles_[1], "bar", "one"));
ASSERT_EQ("first", Get(1, "foo"));
ASSERT_EQ("one", Get(1, "bar"));
// Read directly from persited data.
ReadOptions ropt;
ropt.read_tier = kPersistedTier;
std::string value;
if (wopt.disableWAL) {
// as data has not yet being flushed, we expect not found.
ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).IsNotFound());
ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).IsNotFound());
} else {
ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value));
ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value));
}
// Multiget
std::vector<ColumnFamilyHandle*> multiget_cfs;
multiget_cfs.push_back(handles_[1]);
multiget_cfs.push_back(handles_[1]);
std::vector<Slice> multiget_keys;
multiget_keys.push_back("foo");
multiget_keys.push_back("bar");
std::vector<std::string> multiget_values;
auto statuses =
db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
if (wopt.disableWAL) {
ASSERT_TRUE(statuses[0].IsNotFound());
ASSERT_TRUE(statuses[1].IsNotFound());
} else {
ASSERT_OK(statuses[0]);
ASSERT_OK(statuses[1]);
}
// 2nd round: flush and put a new value in memtable.
ASSERT_OK(Flush(1));
ASSERT_OK(db_->Put(wopt, handles_[1], "rocksdb", "hello"));
// once the data has been flushed, we are able to get the
// data when kPersistedTier is used.
ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).ok());
ASSERT_EQ(value, "first");
ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).ok());
ASSERT_EQ(value, "one");
if (wopt.disableWAL) {
ASSERT_TRUE(
db_->Get(ropt, handles_[1], "rocksdb", &value).IsNotFound());
} else {
ASSERT_OK(db_->Get(ropt, handles_[1], "rocksdb", &value));
ASSERT_EQ(value, "hello");
}
// Expect same result in multiget
multiget_cfs.push_back(handles_[1]);
multiget_keys.push_back("rocksdb");
statuses =
db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
ASSERT_TRUE(statuses[0].ok());
ASSERT_EQ("first", multiget_values[0]);
ASSERT_TRUE(statuses[1].ok());
ASSERT_EQ("one", multiget_values[1]);
if (wopt.disableWAL) {
ASSERT_TRUE(statuses[2].IsNotFound());
} else {
ASSERT_OK(statuses[2]);
}
// 3rd round: delete and flush
ASSERT_OK(db_->Delete(wopt, handles_[1], "foo"));
Flush(1);
ASSERT_OK(db_->Delete(wopt, handles_[1], "bar"));
ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).IsNotFound());
if (wopt.disableWAL) {
// Still expect finding the value as its delete has not yet being
// flushed.
ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).ok());
ASSERT_EQ(value, "one");
} else {
ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).IsNotFound());
}
ASSERT_TRUE(db_->Get(ropt, handles_[1], "rocksdb", &value).ok());
ASSERT_EQ(value, "hello");
statuses =
db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
ASSERT_TRUE(statuses[0].IsNotFound());
if (wopt.disableWAL) {
ASSERT_TRUE(statuses[1].ok());
ASSERT_EQ("one", multiget_values[1]);
} else {
ASSERT_TRUE(statuses[1].IsNotFound());
}
ASSERT_TRUE(statuses[2].ok());
ASSERT_EQ("hello", multiget_values[2]);
if (wopt.disableWAL == 0) {
DestroyAndReopen(options);
}
}
} while (ChangeOptions(kSkipHashCuckoo));
}
TEST_F(DBTest, PersistedTierOnIterator) {
// The test needs to be changed if kPersistedTier is supported in iterator.
Options options = CurrentOptions();
CreateAndReopenWithCF({"pikachu"}, options);
ReadOptions ropt;
ropt.read_tier = kPersistedTier;
auto* iter = db_->NewIterator(ropt, handles_[1]);
ASSERT_TRUE(iter->status().IsNotSupported());
delete iter;
std::vector<Iterator*> iters;
ASSERT_TRUE(db_->NewIterators(ropt, {handles_[1]}, &iters).IsNotSupported());
Close();
}
TEST_F(DBTest, SingleDeleteFlush) {
// Test to check whether flushing preserves a single delete hidden
// behind a put.

View file

@ -273,7 +273,7 @@ Status TableCache::Get(const ReadOptions& options,
if (handle != nullptr) {
ReleaseHandle(handle);
}
} else if (options.read_tier && s.IsIncomplete()) {
} else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) {
// Couldn't find Table in cache but treat as kFound if no_io set
get_context->MarkKeyMayExist();
return Status::OK();

View file

@ -1294,8 +1294,12 @@ struct Options : public DBOptions, public ColumnFamilyOptions {
// the block cache. It will not page in data from the OS cache or data that
// resides in storage.
enum ReadTier {
kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage
kBlockCacheTier = 0x1 // data in memtable or block cache
kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage
kBlockCacheTier = 0x1, // data in memtable or block cache
kPersistedTier = 0x2 // persisted data. When WAL is disabled, this option
// will skip data in memtable.
// Note that this ReadTier currently only supports
// Get and MultiGet and does not support iterators.
};
// Options that control read operations

View file

@ -1255,7 +1255,8 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
BlockIter biter;
NewDataBlockIterator(rep_, read_options, iiter.value(), &biter);
if (read_options.read_tier && biter.status().IsIncomplete()) {
if (read_options.read_tier == kBlockCacheTier &&
biter.status().IsIncomplete()) {
// couldn't get block from block_cache
// Update Saver.state to Found because we are only looking for whether
// we can guarantee the key is not there when "no_io" is set