mirror of https://github.com/facebook/rocksdb.git
Allows Get and MultiGet to read directly from SST files.
Summary: Add kSstFileTier to ReadTier, which allows Get and MultiGet to read only directly from SST files and skip mem-tables. kSstFileTier = 0x2 // data in SST files. // Note that this ReadTier currently only supports // Get and MultiGet and does not support iterators. Test Plan: add new test in db_test. Reviewers: anthony, IslamAbdelRahman, rven, kradhakrishnan, sdong Reviewed By: sdong Subscribers: igor, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D53511
This commit is contained in:
parent
fe93bf9b5d
commit
4a8cbf4e31
|
@ -3,12 +3,13 @@
|
||||||
|
|
||||||
## 4.5.0 (2/5/2016)
|
## 4.5.0 (2/5/2016)
|
||||||
### Public API Changes
|
### Public API Changes
|
||||||
* Add a new perf context level between kEnableCount and kEnableTime. Level 2 now doesn't include timers for mutexes.
|
* Add a new perf context level between kEnableCount and kEnableTime. Level 2 now does not include timers for mutexes.
|
||||||
* Statistics of mutex operation durations will not be measured by default. If you want to have them enabled, you need to set Statistics::stats_level_ to kAll.
|
* Statistics of mutex operation durations will not be measured by default. If you want to have them enabled, you need to set Statistics::stats_level_ to kAll.
|
||||||
* DBOptions::delete_scheduler and NewDeleteScheduler() are removed, please use DBOptions::sst_file_manager and NewSstFileManager() instead
|
* DBOptions::delete_scheduler and NewDeleteScheduler() are removed, please use DBOptions::sst_file_manager and NewSstFileManager() instead
|
||||||
|
|
||||||
### New Features
|
### New Features
|
||||||
* ldb tool now supports operations to non-default column families.
|
* ldb tool now supports operations to non-default column families.
|
||||||
|
* Add kPersistedTier to ReadTier. This option allows Get and MultiGet to read only the persited data and skip mem-tables if writes were done with disableWAL = true.
|
||||||
* Add DBOptions::sst_file_manager. Use NewSstFileManager() in include/rocksdb/sst_file_manager.h to create a SstFileManager that can be used to track the total size of SST files and control the SST files deletion rate.
|
* Add DBOptions::sst_file_manager. Use NewSstFileManager() in include/rocksdb/sst_file_manager.h to create a SstFileManager that can be used to track the total size of SST files and control the SST files deletion rate.
|
||||||
|
|
||||||
## 4.4.0 (1/14/2016)
|
## 4.4.0 (1/14/2016)
|
||||||
|
|
|
@ -275,7 +275,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname)
|
||||||
db_options_.delete_obsolete_files_period_micros),
|
db_options_.delete_obsolete_files_period_micros),
|
||||||
last_stats_dump_time_microsec_(0),
|
last_stats_dump_time_microsec_(0),
|
||||||
next_job_id_(1),
|
next_job_id_(1),
|
||||||
flush_on_destroy_(false),
|
has_unpersisted_data_(false),
|
||||||
env_options_(db_options_),
|
env_options_(db_options_),
|
||||||
#ifndef ROCKSDB_LITE
|
#ifndef ROCKSDB_LITE
|
||||||
wal_manager_(db_options_, env_options_),
|
wal_manager_(db_options_, env_options_),
|
||||||
|
@ -322,7 +322,8 @@ void DBImpl::CancelAllBackgroundWork(bool wait) {
|
||||||
DBImpl::~DBImpl() {
|
DBImpl::~DBImpl() {
|
||||||
mutex_.Lock();
|
mutex_.Lock();
|
||||||
|
|
||||||
if (!shutting_down_.load(std::memory_order_acquire) && flush_on_destroy_) {
|
if (!shutting_down_.load(std::memory_order_acquire) &&
|
||||||
|
has_unpersisted_data_) {
|
||||||
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
||||||
if (!cfd->IsDropped() && !cfd->mem()->IsEmpty()) {
|
if (!cfd->IsDropped() && !cfd->mem()->IsEmpty()) {
|
||||||
cfd->Ref();
|
cfd->Ref();
|
||||||
|
@ -3306,13 +3307,19 @@ Status DBImpl::GetImpl(const ReadOptions& read_options,
|
||||||
LookupKey lkey(key, snapshot);
|
LookupKey lkey(key, snapshot);
|
||||||
PERF_TIMER_STOP(get_snapshot_time);
|
PERF_TIMER_STOP(get_snapshot_time);
|
||||||
|
|
||||||
|
bool skip_memtable =
|
||||||
|
(read_options.read_tier == kPersistedTier && has_unpersisted_data_);
|
||||||
|
bool done = false;
|
||||||
|
if (!skip_memtable) {
|
||||||
if (sv->mem->Get(lkey, value, &s, &merge_context)) {
|
if (sv->mem->Get(lkey, value, &s, &merge_context)) {
|
||||||
// Done
|
done = true;
|
||||||
RecordTick(stats_, MEMTABLE_HIT);
|
RecordTick(stats_, MEMTABLE_HIT);
|
||||||
} else if (sv->imm->Get(lkey, value, &s, &merge_context)) {
|
} else if (sv->imm->Get(lkey, value, &s, &merge_context)) {
|
||||||
// Done
|
done = true;
|
||||||
RecordTick(stats_, MEMTABLE_HIT);
|
RecordTick(stats_, MEMTABLE_HIT);
|
||||||
} else {
|
}
|
||||||
|
}
|
||||||
|
if (!done) {
|
||||||
PERF_TIMER_GUARD(get_from_output_files_time);
|
PERF_TIMER_GUARD(get_from_output_files_time);
|
||||||
sv->current->Get(read_options, lkey, value, &s, &merge_context,
|
sv->current->Get(read_options, lkey, value, &s, &merge_context,
|
||||||
value_found);
|
value_found);
|
||||||
|
@ -3397,14 +3404,23 @@ std::vector<Status> DBImpl::MultiGet(
|
||||||
assert(mgd_iter != multiget_cf_data.end());
|
assert(mgd_iter != multiget_cf_data.end());
|
||||||
auto mgd = mgd_iter->second;
|
auto mgd = mgd_iter->second;
|
||||||
auto super_version = mgd->super_version;
|
auto super_version = mgd->super_version;
|
||||||
|
bool skip_memtable =
|
||||||
|
(read_options.read_tier == kPersistedTier && has_unpersisted_data_);
|
||||||
|
bool done = false;
|
||||||
|
if (!skip_memtable) {
|
||||||
if (super_version->mem->Get(lkey, value, &s, &merge_context)) {
|
if (super_version->mem->Get(lkey, value, &s, &merge_context)) {
|
||||||
// Done
|
done = true;
|
||||||
|
// TODO(?): RecordTick(stats_, MEMTABLE_HIT)?
|
||||||
} else if (super_version->imm->Get(lkey, value, &s, &merge_context)) {
|
} else if (super_version->imm->Get(lkey, value, &s, &merge_context)) {
|
||||||
// Done
|
done = true;
|
||||||
} else {
|
// TODO(?): RecordTick(stats_, MEMTABLE_HIT)?
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!done) {
|
||||||
PERF_TIMER_GUARD(get_from_output_files_time);
|
PERF_TIMER_GUARD(get_from_output_files_time);
|
||||||
super_version->current->Get(read_options, lkey, value, &s,
|
super_version->current->Get(read_options, lkey, value, &s,
|
||||||
&merge_context);
|
&merge_context);
|
||||||
|
// TODO(?): RecordTick(stats_, MEMTABLE_MISS)?
|
||||||
}
|
}
|
||||||
|
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
|
@ -3843,6 +3859,10 @@ bool DBImpl::KeyMayExist(const ReadOptions& read_options,
|
||||||
|
|
||||||
Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
|
Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
|
||||||
ColumnFamilyHandle* column_family) {
|
ColumnFamilyHandle* column_family) {
|
||||||
|
if (read_options.read_tier == kPersistedTier) {
|
||||||
|
return NewErrorIterator(Status::NotSupported(
|
||||||
|
"ReadTier::kPersistedData is not yet supported in iterators."));
|
||||||
|
}
|
||||||
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
||||||
auto cfd = cfh->cfd();
|
auto cfd = cfh->cfd();
|
||||||
|
|
||||||
|
@ -3949,6 +3969,10 @@ Status DBImpl::NewIterators(
|
||||||
const ReadOptions& read_options,
|
const ReadOptions& read_options,
|
||||||
const std::vector<ColumnFamilyHandle*>& column_families,
|
const std::vector<ColumnFamilyHandle*>& column_families,
|
||||||
std::vector<Iterator*>* iterators) {
|
std::vector<Iterator*>* iterators) {
|
||||||
|
if (read_options.read_tier == kPersistedTier) {
|
||||||
|
return Status::NotSupported(
|
||||||
|
"ReadTier::kPersistedData is not yet supported in iterators.");
|
||||||
|
}
|
||||||
iterators->clear();
|
iterators->clear();
|
||||||
iterators->reserve(column_families.size());
|
iterators->reserve(column_families.size());
|
||||||
XFUNC_TEST("", "managed_new", managed_new1, xf_manage_new,
|
XFUNC_TEST("", "managed_new", managed_new1, xf_manage_new,
|
||||||
|
@ -4327,7 +4351,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
||||||
PERF_TIMER_STOP(write_pre_and_post_process_time);
|
PERF_TIMER_STOP(write_pre_and_post_process_time);
|
||||||
|
|
||||||
if (write_options.disableWAL) {
|
if (write_options.disableWAL) {
|
||||||
flush_on_destroy_ = true;
|
has_unpersisted_data_ = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t log_size = 0;
|
uint64_t log_size = 0;
|
||||||
|
|
|
@ -822,7 +822,10 @@ class DBImpl : public DB {
|
||||||
// they're unique
|
// they're unique
|
||||||
std::atomic<int> next_job_id_;
|
std::atomic<int> next_job_id_;
|
||||||
|
|
||||||
bool flush_on_destroy_; // Used when disableWAL is true.
|
// A flag indicating whether the current rocksdb database has any
|
||||||
|
// data that is not yet persisted into either WAL or SST file.
|
||||||
|
// Used when disableWAL is true.
|
||||||
|
bool has_unpersisted_data_;
|
||||||
|
|
||||||
static const int KEEP_LOG_FILE_NUM = 1000;
|
static const int KEEP_LOG_FILE_NUM = 1000;
|
||||||
// MSVC version 1800 still does not have constexpr for ::max()
|
// MSVC version 1800 still does not have constexpr for ::max()
|
||||||
|
|
129
db/db_test.cc
129
db/db_test.cc
|
@ -519,6 +519,135 @@ TEST_F(DBTest, PutSingleDeleteGet) {
|
||||||
kSkipUniversalCompaction | kSkipMergePut));
|
kSkipUniversalCompaction | kSkipMergePut));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_F(DBTest, ReadFromPersistedTier) {
|
||||||
|
do {
|
||||||
|
Random rnd(301);
|
||||||
|
Options options = CurrentOptions();
|
||||||
|
for (int disableWAL = 0; disableWAL <= 1; ++disableWAL) {
|
||||||
|
CreateAndReopenWithCF({"pikachu"}, options);
|
||||||
|
WriteOptions wopt;
|
||||||
|
wopt.disableWAL = (disableWAL == 1);
|
||||||
|
// 1st round: put but not flush
|
||||||
|
ASSERT_OK(db_->Put(wopt, handles_[1], "foo", "first"));
|
||||||
|
ASSERT_OK(db_->Put(wopt, handles_[1], "bar", "one"));
|
||||||
|
ASSERT_EQ("first", Get(1, "foo"));
|
||||||
|
ASSERT_EQ("one", Get(1, "bar"));
|
||||||
|
|
||||||
|
// Read directly from persited data.
|
||||||
|
ReadOptions ropt;
|
||||||
|
ropt.read_tier = kPersistedTier;
|
||||||
|
std::string value;
|
||||||
|
if (wopt.disableWAL) {
|
||||||
|
// as data has not yet being flushed, we expect not found.
|
||||||
|
ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).IsNotFound());
|
||||||
|
ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).IsNotFound());
|
||||||
|
} else {
|
||||||
|
ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value));
|
||||||
|
ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Multiget
|
||||||
|
std::vector<ColumnFamilyHandle*> multiget_cfs;
|
||||||
|
multiget_cfs.push_back(handles_[1]);
|
||||||
|
multiget_cfs.push_back(handles_[1]);
|
||||||
|
std::vector<Slice> multiget_keys;
|
||||||
|
multiget_keys.push_back("foo");
|
||||||
|
multiget_keys.push_back("bar");
|
||||||
|
std::vector<std::string> multiget_values;
|
||||||
|
auto statuses =
|
||||||
|
db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
|
||||||
|
if (wopt.disableWAL) {
|
||||||
|
ASSERT_TRUE(statuses[0].IsNotFound());
|
||||||
|
ASSERT_TRUE(statuses[1].IsNotFound());
|
||||||
|
} else {
|
||||||
|
ASSERT_OK(statuses[0]);
|
||||||
|
ASSERT_OK(statuses[1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2nd round: flush and put a new value in memtable.
|
||||||
|
ASSERT_OK(Flush(1));
|
||||||
|
ASSERT_OK(db_->Put(wopt, handles_[1], "rocksdb", "hello"));
|
||||||
|
|
||||||
|
// once the data has been flushed, we are able to get the
|
||||||
|
// data when kPersistedTier is used.
|
||||||
|
ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).ok());
|
||||||
|
ASSERT_EQ(value, "first");
|
||||||
|
ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).ok());
|
||||||
|
ASSERT_EQ(value, "one");
|
||||||
|
if (wopt.disableWAL) {
|
||||||
|
ASSERT_TRUE(
|
||||||
|
db_->Get(ropt, handles_[1], "rocksdb", &value).IsNotFound());
|
||||||
|
} else {
|
||||||
|
ASSERT_OK(db_->Get(ropt, handles_[1], "rocksdb", &value));
|
||||||
|
ASSERT_EQ(value, "hello");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Expect same result in multiget
|
||||||
|
multiget_cfs.push_back(handles_[1]);
|
||||||
|
multiget_keys.push_back("rocksdb");
|
||||||
|
statuses =
|
||||||
|
db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
|
||||||
|
ASSERT_TRUE(statuses[0].ok());
|
||||||
|
ASSERT_EQ("first", multiget_values[0]);
|
||||||
|
ASSERT_TRUE(statuses[1].ok());
|
||||||
|
ASSERT_EQ("one", multiget_values[1]);
|
||||||
|
if (wopt.disableWAL) {
|
||||||
|
ASSERT_TRUE(statuses[2].IsNotFound());
|
||||||
|
} else {
|
||||||
|
ASSERT_OK(statuses[2]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 3rd round: delete and flush
|
||||||
|
ASSERT_OK(db_->Delete(wopt, handles_[1], "foo"));
|
||||||
|
Flush(1);
|
||||||
|
ASSERT_OK(db_->Delete(wopt, handles_[1], "bar"));
|
||||||
|
|
||||||
|
ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).IsNotFound());
|
||||||
|
if (wopt.disableWAL) {
|
||||||
|
// Still expect finding the value as its delete has not yet being
|
||||||
|
// flushed.
|
||||||
|
ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).ok());
|
||||||
|
ASSERT_EQ(value, "one");
|
||||||
|
} else {
|
||||||
|
ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).IsNotFound());
|
||||||
|
}
|
||||||
|
ASSERT_TRUE(db_->Get(ropt, handles_[1], "rocksdb", &value).ok());
|
||||||
|
ASSERT_EQ(value, "hello");
|
||||||
|
|
||||||
|
statuses =
|
||||||
|
db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
|
||||||
|
ASSERT_TRUE(statuses[0].IsNotFound());
|
||||||
|
if (wopt.disableWAL) {
|
||||||
|
ASSERT_TRUE(statuses[1].ok());
|
||||||
|
ASSERT_EQ("one", multiget_values[1]);
|
||||||
|
} else {
|
||||||
|
ASSERT_TRUE(statuses[1].IsNotFound());
|
||||||
|
}
|
||||||
|
ASSERT_TRUE(statuses[2].ok());
|
||||||
|
ASSERT_EQ("hello", multiget_values[2]);
|
||||||
|
if (wopt.disableWAL == 0) {
|
||||||
|
DestroyAndReopen(options);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} while (ChangeOptions(kSkipHashCuckoo));
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(DBTest, PersistedTierOnIterator) {
|
||||||
|
// The test needs to be changed if kPersistedTier is supported in iterator.
|
||||||
|
Options options = CurrentOptions();
|
||||||
|
CreateAndReopenWithCF({"pikachu"}, options);
|
||||||
|
ReadOptions ropt;
|
||||||
|
ropt.read_tier = kPersistedTier;
|
||||||
|
|
||||||
|
auto* iter = db_->NewIterator(ropt, handles_[1]);
|
||||||
|
ASSERT_TRUE(iter->status().IsNotSupported());
|
||||||
|
delete iter;
|
||||||
|
|
||||||
|
std::vector<Iterator*> iters;
|
||||||
|
ASSERT_TRUE(db_->NewIterators(ropt, {handles_[1]}, &iters).IsNotSupported());
|
||||||
|
Close();
|
||||||
|
}
|
||||||
|
|
||||||
TEST_F(DBTest, SingleDeleteFlush) {
|
TEST_F(DBTest, SingleDeleteFlush) {
|
||||||
// Test to check whether flushing preserves a single delete hidden
|
// Test to check whether flushing preserves a single delete hidden
|
||||||
// behind a put.
|
// behind a put.
|
||||||
|
|
|
@ -273,7 +273,7 @@ Status TableCache::Get(const ReadOptions& options,
|
||||||
if (handle != nullptr) {
|
if (handle != nullptr) {
|
||||||
ReleaseHandle(handle);
|
ReleaseHandle(handle);
|
||||||
}
|
}
|
||||||
} else if (options.read_tier && s.IsIncomplete()) {
|
} else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) {
|
||||||
// Couldn't find Table in cache but treat as kFound if no_io set
|
// Couldn't find Table in cache but treat as kFound if no_io set
|
||||||
get_context->MarkKeyMayExist();
|
get_context->MarkKeyMayExist();
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
|
|
|
@ -1295,7 +1295,11 @@ struct Options : public DBOptions, public ColumnFamilyOptions {
|
||||||
// resides in storage.
|
// resides in storage.
|
||||||
enum ReadTier {
|
enum ReadTier {
|
||||||
kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage
|
kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage
|
||||||
kBlockCacheTier = 0x1 // data in memtable or block cache
|
kBlockCacheTier = 0x1, // data in memtable or block cache
|
||||||
|
kPersistedTier = 0x2 // persisted data. When WAL is disabled, this option
|
||||||
|
// will skip data in memtable.
|
||||||
|
// Note that this ReadTier currently only supports
|
||||||
|
// Get and MultiGet and does not support iterators.
|
||||||
};
|
};
|
||||||
|
|
||||||
// Options that control read operations
|
// Options that control read operations
|
||||||
|
|
|
@ -1255,7 +1255,8 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
|
||||||
BlockIter biter;
|
BlockIter biter;
|
||||||
NewDataBlockIterator(rep_, read_options, iiter.value(), &biter);
|
NewDataBlockIterator(rep_, read_options, iiter.value(), &biter);
|
||||||
|
|
||||||
if (read_options.read_tier && biter.status().IsIncomplete()) {
|
if (read_options.read_tier == kBlockCacheTier &&
|
||||||
|
biter.status().IsIncomplete()) {
|
||||||
// couldn't get block from block_cache
|
// couldn't get block from block_cache
|
||||||
// Update Saver.state to Found because we are only looking for whether
|
// Update Saver.state to Found because we are only looking for whether
|
||||||
// we can guarantee the key is not there when "no_io" is set
|
// we can guarantee the key is not there when "no_io" is set
|
||||||
|
|
Loading…
Reference in New Issue