diff --git a/.gitignore b/.gitignore index d987e57679..55f9639d54 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,4 @@ util/build_version.cc build_tools/VALGRIND_LOGS/ coverage/COVERAGE_REPORT util/build_version.cc.tmp +.gdbhistory diff --git a/db/db_impl.cc b/db/db_impl.cc index f19ecba6c0..7f1efdb889 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2203,7 +2203,6 @@ Status DBImpl::Get(const ReadOptions& options, Status DBImpl::GetImpl(const ReadOptions& options, const Slice& key, std::string* value, - const bool no_io, bool* value_found) { Status s; @@ -2242,7 +2241,7 @@ Status DBImpl::GetImpl(const ReadOptions& options, // Done } else { current->Get(options, lkey, value, &s, &merge_operands, &stats, - options_, no_io, value_found); + options_, value_found); have_stat_update = true; } mutex_.Lock(); @@ -2348,7 +2347,9 @@ bool DBImpl::KeyMayExist(const ReadOptions& options, if (value_found != nullptr) { *value_found = true; // falsify later if key-may-exist but can't fetch value } - return GetImpl(options, key, value, true, value_found).ok(); + ReadOptions roptions = options; + roptions.read_tier = kBlockCacheTier; // read from block cache only + return GetImpl(roptions, key, value, value_found).ok(); } Iterator* DBImpl::NewIterator(const ReadOptions& options) { diff --git a/db/db_impl.h b/db/db_impl.h index 4d9b09c49f..6f4b5db426 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -424,7 +424,6 @@ class DBImpl : public DB { Status GetImpl(const ReadOptions& options, const Slice& key, std::string* value, - const bool no_io = false, bool* value_found = nullptr); }; diff --git a/db/db_test.cc b/db/db_test.cc index faa24c6bac..c64b620c5f 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -11,6 +11,7 @@ #include "db/filename.h" #include "db/version_set.h" #include "db/write_batch_internal.h" +#include "db/db_statistics.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/env.h" @@ -829,6 +830,7 @@ TEST(DBTest, KeyMayExist) { std::string value; Options options = CurrentOptions(); options.filter_policy = NewBloomFilterPolicy(20); + options.statistics = leveldb::CreateDBStatistics(); Reopen(&options); ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value)); @@ -841,24 +843,114 @@ TEST(DBTest, KeyMayExist) { dbfull()->Flush(FlushOptions()); value.clear(); - value_found = false; + + long numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); + long cache_miss = + options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS); ASSERT_TRUE(db_->KeyMayExist(ropts, "a", &value, &value_found)); ASSERT_TRUE(!value_found); + // assert that no new files were opened and no new blocks were + // read into block cache. + ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); + ASSERT_EQ(cache_miss, + options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS)); ASSERT_OK(db_->Delete(WriteOptions(), "a")); + + numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); + cache_miss = options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS); ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value)); + ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); + ASSERT_EQ(cache_miss, + options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS)); dbfull()->Flush(FlushOptions()); dbfull()->CompactRange(nullptr, nullptr); + + numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); + cache_miss = options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS); ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value)); + ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); + ASSERT_EQ(cache_miss, + options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS)); ASSERT_OK(db_->Delete(WriteOptions(), "c")); + + numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); + cache_miss = options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS); ASSERT_TRUE(!db_->KeyMayExist(ropts, "c", &value)); + ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); + ASSERT_EQ(cache_miss, + options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS)); delete options.filter_policy; } while (ChangeOptions()); } +TEST(DBTest, NonBlockingIteration) { + do { + ReadOptions non_blocking_opts, regular_opts; + Options options = CurrentOptions(); + options.statistics = leveldb::CreateDBStatistics(); + non_blocking_opts.read_tier = kBlockCacheTier; + Reopen(&options); + + // write one kv to the database. + ASSERT_OK(db_->Put(WriteOptions(), "a", "b")); + + // scan using non-blocking iterator. We should find it because + // it is in memtable. + Iterator* iter = db_->NewIterator(non_blocking_opts); + int count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_TRUE(iter->status().ok()); + count++; + } + ASSERT_EQ(count, 1); + delete iter; + + // flush memtable to storage. Now, the key should not be in the + // memtable neither in the block cache. + dbfull()->Flush(FlushOptions()); + + // verify that a non-blocking iterator does not find any + // kvs. Neither does it do any IOs to storage. + long numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); + long cache_miss = + options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS); + iter = db_->NewIterator(non_blocking_opts); + count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + count++; + } + ASSERT_EQ(count, 0); + ASSERT_TRUE(iter->status().IsIncomplete()); + ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); + ASSERT_EQ(cache_miss, + options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS)); + delete iter; + + // read in the specified block via a regular get + ASSERT_EQ(Get("a"), "b"); + + // verify that we can find it via a non-blocking scan + numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); + cache_miss = options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS); + iter = db_->NewIterator(non_blocking_opts); + count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_TRUE(iter->status().ok()); + count++; + } + ASSERT_EQ(count, 1); + ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); + ASSERT_EQ(cache_miss, + options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS)); + delete iter; + + } while (ChangeOptions()); +} + // A delete is skipped for key if KeyMayExist(key) returns False // Tests Writebatch consistency and proper delete behaviour TEST(DBTest, FilterDeletes) { diff --git a/db/table_cache.cc b/db/table_cache.cc index 48d1177553..dc8769f48b 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -48,7 +48,7 @@ Status TableCache::FindTable(const EnvOptions& toptions, *handle = cache_->Lookup(key); if (*handle == nullptr) { if (no_io) { // Dont do IO and return a not-found status - return Status::NotFound("Table not found in table_cache, no_io is set"); + return Status::Incomplete("Table not found in table_cache, no_io is set"); } if (table_io != nullptr) { *table_io = true; // we had to do IO from storage @@ -90,7 +90,8 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, } Cache::Handle* handle = nullptr; - Status s = FindTable(toptions, file_number, file_size, &handle); + Status s = FindTable(toptions, file_number, file_size, &handle, + nullptr, options.read_tier == kBlockCacheTier); if (!s.ok()) { return NewErrorIterator(s); } @@ -117,17 +118,17 @@ Status TableCache::Get(const ReadOptions& options, void* arg, bool (*saver)(void*, const Slice&, const Slice&, bool), bool* table_io, - void (*mark_key_may_exist)(void*), - const bool no_io) { + void (*mark_key_may_exist)(void*)) { Cache::Handle* handle = nullptr; Status s = FindTable(storage_options_, file_number, file_size, - &handle, table_io, no_io); + &handle, table_io, + options.read_tier == kBlockCacheTier); if (s.ok()) { Table* t = reinterpret_cast(cache_->Value(handle)); - s = t->InternalGet(options, k, arg, saver, mark_key_may_exist, no_io); + s = t->InternalGet(options, k, arg, saver, mark_key_may_exist); cache_->Release(handle); - } else if (no_io && s.IsNotFound()) { + } else if (options.read_tier && s.IsIncomplete()) { // Couldnt find Table in cache but treat as kFound if no_io set (*mark_key_may_exist)(arg); return Status::OK(); diff --git a/db/table_cache.h b/db/table_cache.h index d7308020c0..c9e68738bc 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -49,8 +49,7 @@ class TableCache { void* arg, bool (*handle_result)(void*, const Slice&, const Slice&, bool), bool* table_io, - void (*mark_key_may_exist)(void*) = nullptr, - const bool no_io = false); + void (*mark_key_may_exist)(void*) = nullptr); // Determine whether the table may contain the specified prefix. If // the table index of blooms are not in memory, this may cause an I/O diff --git a/db/version_set.cc b/db/version_set.cc index dca8c72287..54be370bd9 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -415,7 +415,6 @@ void Version::Get(const ReadOptions& options, std::deque* operands, GetStats* stats, const Options& db_options, - const bool no_io, bool* value_found) { Slice ikey = k.internal_key(); Slice user_key = k.user_key(); @@ -425,9 +424,6 @@ void Version::Get(const ReadOptions& options, auto logger = db_options.info_log; assert(status->ok() || status->IsMergeInProgress()); - if (no_io) { - assert(status->ok()); - } Saver saver; saver.state = status->ok()? kNotFound : kMerge; saver.ucmp = ucmp; @@ -516,7 +512,7 @@ void Version::Get(const ReadOptions& options, bool tableIO = false; *status = vset_->table_cache_->Get(options, f->number, f->file_size, ikey, &saver, SaveValue, &tableIO, - MarkKeyMayExist, no_io); + MarkKeyMayExist); // TODO: examine the behavior for corrupted key if (!status->ok()) { return; diff --git a/db/version_set.h b/db/version_set.h index 9a7aeb20b9..9a10682978 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -76,7 +76,7 @@ class Version { }; void Get(const ReadOptions&, const LookupKey& key, std::string* val, Status* status, std::deque* operands, GetStats* stats, - const Options& db_option, const bool no_io = false, + const Options& db_option, bool* value_found = nullptr); // Adds "stats" into the current state. Returns true if a new diff --git a/include/rocksdb/iterator.h b/include/rocksdb/iterator.h index 9dde6d70f4..4270e95f7d 100644 --- a/include/rocksdb/iterator.h +++ b/include/rocksdb/iterator.h @@ -65,6 +65,8 @@ class Iterator { virtual Slice value() const = 0; // If an error has occurred, return it. Else return an ok status. + // If non-blocking IO is requested and this operation cannot be + // satisfied without doing some IO, then this returns Status::Incomplete(). virtual Status status() const = 0; // Clients are allowed to register function/arg1/arg2 triples that diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index e3701af097..8e66811ac0 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -543,6 +543,18 @@ struct Options { std::shared_ptr compaction_filter_factory; }; +// +// An application can issue a read request (via Get/Iterators) and specify +// if that read should process data that ALREADY resides on a specified cache +// level. For example, if an application specifies kBlockCacheTier then the +// Get call will process data that is already processed in the memtable or +// the block cache. It will not page in data from the OS cache or data that +// resides in storage. +enum ReadTier { + kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage + kBlockCacheTier = 0x1 // data in memtable or block cache +}; + // Options that control read operations struct ReadOptions { // If true, all data read from underlying storage will be @@ -575,15 +587,23 @@ struct ReadOptions { // Default: nullptr const Slice* prefix; + // Specify if this read request should process data that ALREADY + // resides on a particular cache. If the required data is not + // found at the specified cache, then Status::WouldBlock is returned. + // Default: kReadAllTier + ReadTier read_tier; + ReadOptions() : verify_checksums(false), fill_cache(true), snapshot(nullptr), - prefix(nullptr) { + prefix(nullptr), + read_tier(kReadAllTier) { } ReadOptions(bool cksum, bool cache) : verify_checksums(cksum), fill_cache(cache), - snapshot(nullptr), prefix(nullptr) { + snapshot(nullptr), prefix(nullptr), + read_tier(kReadAllTier) { } }; diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index f8cdbc7a1d..f3af5bfabf 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -50,6 +50,9 @@ class Status { static Status MergeInProgress(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kMergeInProgress, msg, msg2); } + static Status Incomplete(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIncomplete, msg, msg2); + } // Returns true iff the status indicates success. bool ok() const { return (state_ == nullptr); } @@ -72,6 +75,9 @@ class Status { // Returns true iff the status indicates an MergeInProgress. bool IsMergeInProgress() const { return code() == kMergeInProgress; } + // Returns true iff the status indicates Incomplete + bool IsIncomplete() const { return code() == kIncomplete; } + // Return a string representation of this status suitable for printing. // Returns the string "OK" for success. std::string ToString() const; @@ -92,6 +98,7 @@ class Status { kInvalidArgument = 4, kIOError = 5, kMergeInProgress = 6, + kIncomplete = 7 }; Code code() const { diff --git a/table/table.cc b/table/table.cc index 6d7ddb6ac2..f2b80cbbc3 100644 --- a/table/table.cc +++ b/table/table.cc @@ -237,8 +237,8 @@ Iterator* Table::BlockReader(void* arg, const ReadOptions& options, const Slice& index_value, bool* didIO, - bool for_compaction, - const bool no_io) { + bool for_compaction) { + const bool no_io = (options.read_tier == kBlockCacheTier); Table* table = reinterpret_cast(arg); Cache* block_cache = table->rep_->options.block_cache.get(); std::shared_ptr statistics = table->rep_->options.statistics; @@ -268,7 +268,8 @@ Iterator* Table::BlockReader(void* arg, RecordTick(statistics, BLOCK_CACHE_HIT); } else if (no_io) { - return nullptr; // Did not find in block_cache and can't do IO + // Did not find in block_cache and can't do IO + return NewErrorIterator(Status::Incomplete("no blocking io")); } else { Histograms histogram = for_compaction ? READ_BLOCK_COMPACTION_MICROS : READ_BLOCK_GET_MICROS; @@ -292,7 +293,8 @@ Iterator* Table::BlockReader(void* arg, RecordTick(statistics, BLOCK_CACHE_MISS); } } else if (no_io) { - return nullptr; // Could not read from block_cache and can't do IO + // Could not read from block_cache and can't do IO + return NewErrorIterator(Status::Incomplete("no blocking io")); }else { s = ReadBlock(table->rep_->file.get(), options, handle, &block, didIO); } @@ -401,8 +403,7 @@ Status Table::InternalGet(const ReadOptions& options, const Slice& k, void* arg, bool (*saver)(void*, const Slice&, const Slice&, bool), - void (*mark_key_may_exist)(void*), - const bool no_io) { + void (*mark_key_may_exist)(void*)) { Status s; Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator); bool done = false; @@ -421,9 +422,10 @@ Status Table::InternalGet(const ReadOptions& options, const Slice& k, } else { bool didIO = false; Iterator* block_iter = BlockReader(this, options, iiter->value(), - &didIO, false, no_io); + &didIO); - if (no_io && !block_iter) { // couldn't get block from block_cache + if (options.read_tier && block_iter->status().IsIncomplete()) { + // couldn't get block from block_cache // Update Saver.state to Found because we are only looking for whether // we can guarantee the key is not there when "no_io" is set (*mark_key_may_exist)(arg); diff --git a/table/table.h b/table/table.h index a7014f911f..52d618f38b 100644 --- a/table/table.h +++ b/table/table.h @@ -79,8 +79,7 @@ class Table { const EnvOptions& soptions, const Slice&, bool for_compaction); static Iterator* BlockReader(void*, const ReadOptions&, const Slice&, - bool* didIO, bool for_compaction = false, - const bool no_io = false); + bool* didIO, bool for_compaction = false); // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found // after a call to Seek(key), until handle_result returns false. @@ -90,8 +89,7 @@ class Table { const ReadOptions&, const Slice& key, void* arg, bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool), - void (*mark_key_may_exist)(void*) = nullptr, - const bool no_io = false); + void (*mark_key_may_exist)(void*) = nullptr); void ReadMeta(const Footer& footer);