From 3bde41b5a3f71a67cfee67d2a26244b80c777148 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Tue, 16 Jul 2019 13:11:23 -0700 Subject: [PATCH] Move the filter readers out of the block cache (#5504) Summary: Currently, when the block cache is used for the filter block, it is not really the block itself that is stored in the cache but a FilterBlockReader object. Since this object is not pure data (it has, for instance, pointers that might dangle, including in one case a back pointer to the TableReader), it's not really sharable. To avoid the issues around this, the current code erases the cache entries when the TableReader is closed (which, BTW, is not sufficient since a concurrent TableReader might have picked up the object in the meantime). Instead of doing this, the patch moves the FilterBlockReader out of the cache altogether, and decouples the filter reader object from the filter block. In particular, instead of the TableReader owning, or caching/pinning the FilterBlockReader (based on the customer's settings), with the change the TableReader unconditionally owns the FilterBlockReader, which in turn owns/caches/pins the filter block. This change also enables us to reuse the code paths historically used for data blocks for filters as well. Note: Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a separate phase. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504 Test Plan: make asan_check Differential Revision: D16036974 Pulled By: ltamasi fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091 --- CMakeLists.txt | 1 + HISTORY.md | 3 +- TARGETS | 1 + db/db_block_cache_test.cc | 14 +- src.mk | 1 + table/block_based/block_based_filter_block.cc | 178 +++-- table/block_based/block_based_filter_block.h | 45 +- .../block_based_filter_block_test.cc | 318 ++++++--- table/block_based/block_based_table_reader.cc | 624 +++++++----------- table/block_based/block_based_table_reader.h | 69 +- table/block_based/cachable_entry.h | 1 + table/block_based/filter_block.h | 66 +- .../block_based/filter_block_reader_common.cc | 90 +++ .../block_based/filter_block_reader_common.h | 54 ++ table/block_based/full_filter_block.cc | 156 +++-- table/block_based/full_filter_block.h | 60 +- table/block_based/full_filter_block_test.cc | 204 ++++-- table/block_based/partitioned_filter_block.cc | 303 +++++---- table/block_based/partitioned_filter_block.h | 66 +- .../partitioned_filter_block_test.cc | 116 ++-- table/table_reader.h | 3 +- table/table_test.cc | 6 +- tools/sst_dump_tool.cc | 3 +- 23 files changed, 1393 insertions(+), 989 deletions(-) create mode 100644 table/block_based/filter_block_reader_common.cc create mode 100644 table/block_based/filter_block_reader_common.h diff --git a/CMakeLists.txt b/CMakeLists.txt index c47f9811ef..65904b8cae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -595,6 +595,7 @@ set(SOURCES table/block_based/block_prefix_index.cc table/block_based/data_block_hash_index.cc table/block_based/data_block_footer.cc + table/block_based/filter_block_reader_common.cc table/block_based/flush_block_policy.cc table/block_based/full_filter_block.cc table/block_based/index_builder.cc diff --git a/HISTORY.md b/HISTORY.md index 099c9f37e8..2e1e03f68d 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -6,8 +6,9 @@ ### Public API Change * Now DB::Close() will return 
Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released. +* Index and filter blocks are now handled similarly to data blocks with regard to the block cache: instead of storing reader objects in the cache, only the blocks themselves are cached. In addition, index and filter blocks (as well as filter partitions) no longer get evicted from the cache when a table is closed. Moreover, index blocks can now use the compressed block cache (if any). * Partitions of partitioned indexes no longer affect the read amplification statistics. -* Due to a refactoring, block cache eviction statistics for indexes are temporarily broken. We plan to reintroduce them in a later phase. +* Due to the above refactoring, block cache eviction statistics for indexes and filters are temporarily broken. We plan to reintroduce them in a later phase. * options.keep_log_file_num will be enforced strictly all the time. File names of all log files will be tracked, which may take significantly amount of memory if options.keep_log_file_num is large and either of options.max_log_file_size or options.log_file_time_to_roll is set. * Add initial support for Get/Put with user timestamps. Users can specify timestamps via ReadOptions and WriteOptions when calling DB::Get and DB::Put. * Accessing a partition of a partitioned filter or index through a pinned reference is no longer considered a cache hit. 
diff --git a/TARGETS b/TARGETS index 6ef3da179d..eda1051396 100644 --- a/TARGETS +++ b/TARGETS @@ -192,6 +192,7 @@ cpp_library( "table/block_based/block_prefix_index.cc", "table/block_based/data_block_footer.cc", "table/block_based/data_block_hash_index.cc", + "table/block_based/filter_block_reader_common.cc", "table/block_based/flush_block_policy.cc", "table/block_based/full_filter_block.cc", "table/block_based/index_builder.cc", diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index 8eb73a23dd..77f37da0d4 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -365,11 +365,11 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { ASSERT_EQ(cache->GetUsage(), index_bytes_insert + filter_bytes_insert); // set the cache capacity to the current usage cache->SetCapacity(index_bytes_insert + filter_bytes_insert); - // The index eviction statistics were broken by the refactoring that moved - // the index readers out of the block cache. Disabling these until we can + // The index and filter eviction statistics were broken by the refactoring + // that moved the readers out of the block cache. Disabling these until we can // bring the stats back. // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0); - ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), 0); + // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), 0); // Note that the second key needs to be no longer than the first one. // Otherwise the second index block may not fit in cache. ASSERT_OK(Put(1, "key", "val")); @@ -380,13 +380,13 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { index_bytes_insert); ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT), filter_bytes_insert); - // The index eviction statistics were broken by the refactoring that moved - // the index readers out of the block cache. 
Disabling these until we can + // The index and filter eviction statistics were broken by the refactoring + // that moved the readers out of the block cache. Disabling these until we can // bring the stats back. // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), // index_bytes_insert); - ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), - filter_bytes_insert); + // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), + // filter_bytes_insert); } namespace { diff --git a/src.mk b/src.mk index bc49b7ce07..fe930d5f49 100644 --- a/src.mk +++ b/src.mk @@ -115,6 +115,7 @@ LIB_SOURCES = \ table/block_based/block_prefix_index.cc \ table/block_based/data_block_hash_index.cc \ table/block_based/data_block_footer.cc \ + table/block_based/filter_block_reader_common.cc \ table/block_based/flush_block_policy.cc \ table/block_based/full_filter_block.cc \ table/block_based/index_builder.cc \ diff --git a/table/block_based/block_based_filter_block.cc b/table/block_based/block_based_filter_block.cc index e5a32e4635..5585b8441c 100644 --- a/table/block_based/block_based_filter_block.cc +++ b/table/block_based/block_based_filter_block.cc @@ -13,6 +13,7 @@ #include "db/dbformat.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" #include "util/coding.h" #include "util/string_util.h" @@ -162,58 +163,120 @@ void BlockBasedFilterBlockBuilder::GenerateFilter() { } BlockBasedFilterBlockReader::BlockBasedFilterBlockReader( - const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt, bool _whole_key_filtering, - BlockContents&& contents, Statistics* stats) - : FilterBlockReader(contents.data.size(), stats, _whole_key_filtering), - policy_(table_opt.filter_policy.get()), - prefix_extractor_(prefix_extractor), - data_(nullptr), - offset_(nullptr), - num_(0), - base_lg_(0), - contents_(std::move(contents)) { - assert(policy_); - 
size_t n = contents_.data.size(); - if (n < 5) return; // 1 byte for base_lg_ and 4 for start of offset array - base_lg_ = contents_.data[n - 1]; - uint32_t last_word = DecodeFixed32(contents_.data.data() + n - 5); - if (last_word > n - 5) return; - data_ = contents_.data.data(); - offset_ = data_ + last_word; - num_ = (n - 5 - last_word) / 4; + const BlockBasedTable* t, CachableEntry&& filter_block) + : FilterBlockReaderCommon(t, std::move(filter_block)) { + assert(table()); + assert(table()->get_rep()); + assert(table()->get_rep()->filter_policy); +} + +std::unique_ptr BlockBasedFilterBlockReader::Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + + CachableEntry filter_block; + if (prefetch || !use_cache) { + const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(), + nullptr /* get_context */, lookup_context, + &filter_block); + if (!s.ok()) { + return std::unique_ptr(); + } + + if (use_cache && !pin) { + filter_block.Reset(); + } + } + + return std::unique_ptr( + new BlockBasedFilterBlockReader(table, std::move(filter_block))); } bool BlockBasedFilterBlockReader::KeyMayMatch( const Slice& key, const SliceTransform* /* prefix_extractor */, - uint64_t block_offset, const bool /*no_io*/, - const Slice* const /*const_ikey_ptr*/, - BlockCacheLookupContext* /*context*/) { + uint64_t block_offset, const bool no_io, + const Slice* const /*const_ikey_ptr*/, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { assert(block_offset != kNotValid); - if (!whole_key_filtering_) { + if (!whole_key_filtering()) { return true; } - return MayMatch(key, block_offset); + return MayMatch(key, block_offset, no_io, get_context, lookup_context); } bool BlockBasedFilterBlockReader::PrefixMayMatch( const Slice& prefix, const SliceTransform* /* prefix_extractor */, - 
uint64_t block_offset, const bool /*no_io*/, - const Slice* const /*const_ikey_ptr*/, - BlockCacheLookupContext* /*context*/) { + uint64_t block_offset, const bool no_io, + const Slice* const /*const_ikey_ptr*/, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { assert(block_offset != kNotValid); - return MayMatch(prefix, block_offset); + return MayMatch(prefix, block_offset, no_io, get_context, lookup_context); } -bool BlockBasedFilterBlockReader::MayMatch(const Slice& entry, - uint64_t block_offset) { - uint64_t index = block_offset >> base_lg_; - if (index < num_) { - uint32_t start = DecodeFixed32(offset_ + index * 4); - uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4); - if (start <= limit && limit <= (uint32_t)(offset_ - data_)) { - Slice filter = Slice(data_ + start, limit - start); - bool const may_match = policy_->KeyMayMatch(entry, filter); +bool BlockBasedFilterBlockReader::ParseFieldsFromBlock( + const BlockContents& contents, const char** data, const char** offset, + size_t* num, size_t* base_lg) { + assert(data); + assert(offset); + assert(num); + assert(base_lg); + + const size_t n = contents.data.size(); + if (n < 5) { // 1 byte for base_lg and 4 for start of offset array + return false; + } + + const uint32_t last_word = DecodeFixed32(contents.data.data() + n - 5); + if (last_word > n - 5) { + return false; + } + + *data = contents.data.data(); + *offset = (*data) + last_word; + *num = (n - 5 - last_word) / 4; + *base_lg = contents.data[n - 1]; + + return true; +} + +bool BlockBasedFilterBlockReader::MayMatch( + const Slice& entry, uint64_t block_offset, bool no_io, + GetContext* get_context, BlockCacheLookupContext* lookup_context) const { + CachableEntry filter_block; + + const Status s = + GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); + if (!s.ok()) { + return true; + } + + assert(filter_block.GetValue()); + + const char* data = nullptr; + const char* offset = nullptr; + size_t num = 0; + 
size_t base_lg = 0; + if (!ParseFieldsFromBlock(*filter_block.GetValue(), &data, &offset, &num, + &base_lg)) { + return true; // Errors are treated as potential matches + } + + const uint64_t index = block_offset >> base_lg; + if (index < num) { + const uint32_t start = DecodeFixed32(offset + index * 4); + const uint32_t limit = DecodeFixed32(offset + index * 4 + 4); + if (start <= limit && limit <= (uint32_t)(offset - data)) { + const Slice filter = Slice(data + start, limit - start); + + assert(table()); + assert(table()->get_rep()); + const FilterPolicy* const policy = table()->get_rep()->filter_policy; + + const bool may_match = policy->KeyMayMatch(entry, filter); if (may_match) { PERF_COUNTER_ADD(bloom_sst_hit_count, 1); return true; @@ -230,27 +293,54 @@ bool BlockBasedFilterBlockReader::MayMatch(const Slice& entry, } size_t BlockBasedFilterBlockReader::ApproximateMemoryUsage() const { - return num_ * 4 + 5 + (offset_ - data_); + size_t usage = ApproximateFilterBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; } std::string BlockBasedFilterBlockReader::ToString() const { + CachableEntry filter_block; + + const Status s = + GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */, + nullptr /* lookup_context */, &filter_block); + if (!s.ok()) { + return std::string("Unable to retrieve filter block"); + } + + assert(filter_block.GetValue()); + + const char* data = nullptr; + const char* offset = nullptr; + size_t num = 0; + size_t base_lg = 0; + if (!ParseFieldsFromBlock(*filter_block.GetValue(), &data, &offset, &num, + &base_lg)) { + return std::string("Error parsing filter block"); + } + std::string result; result.reserve(1024); std::string s_bo("Block offset"), s_hd("Hex dump"), s_fb("# filter blocks"); - AppendItem(&result, s_fb, rocksdb::ToString(num_)); + AppendItem(&result, s_fb, rocksdb::ToString(num)); 
AppendItem(&result, s_bo, s_hd); - for (size_t index = 0; index < num_; index++) { - uint32_t start = DecodeFixed32(offset_ + index * 4); - uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4); + for (size_t index = 0; index < num; index++) { + uint32_t start = DecodeFixed32(offset + index * 4); + uint32_t limit = DecodeFixed32(offset + index * 4 + 4); if (start != limit) { result.append(" filter block # " + rocksdb::ToString(index + 1) + "\n"); - Slice filter = Slice(data_ + start, limit - start); + Slice filter = Slice(data + start, limit - start); AppendItem(&result, start, filter.ToString(true)); } } return result; } + } // namespace rocksdb diff --git a/table/block_based/block_based_filter_block.h b/table/block_based/block_based_filter_block.h index cd86ff5c8a..43dbc4f4f9 100644 --- a/table/block_based/block_based_filter_block.h +++ b/table/block_based/block_based_filter_block.h @@ -22,7 +22,8 @@ #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" -#include "table/block_based/filter_block.h" +#include "table/block_based/filter_block_reader_common.h" +#include "table/format.h" #include "util/hash.h" namespace rocksdb { @@ -75,42 +76,42 @@ class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { // A FilterBlockReader is used to parse filter from SST table. // KeyMayMatch and PrefixMayMatch would trigger filter checking -class BlockBasedFilterBlockReader : public FilterBlockReader { +class BlockBasedFilterBlockReader + : public FilterBlockReaderCommon { public: - // REQUIRES: "contents" and *policy must stay live while *this is live. 
- BlockBasedFilterBlockReader(const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt, - bool whole_key_filtering, - BlockContents&& contents, Statistics* statistics); + BlockBasedFilterBlockReader(const BlockBasedTable* t, + CachableEntry&& filter_block); + + static std::unique_ptr Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context); + bool IsBlockBased() override { return true; } bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, - const Slice* const const_ikey_ptr, - BlockCacheLookupContext* context) override; + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; bool PrefixMayMatch(const Slice& prefix, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, - BlockCacheLookupContext* context) override; + GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; size_t ApproximateMemoryUsage() const override; // convert this object to a human readable form std::string ToString() const override; private: - const FilterPolicy* policy_; - const SliceTransform* prefix_extractor_; - const char* data_; // Pointer to filter data (at block-start) - const char* offset_; // Pointer to beginning of offset array (at block-end) - size_t num_; // Number of entries in offset array - size_t base_lg_; // Encoding parameter (see kFilterBaseLg in .cc file) - BlockContents contents_; + static bool ParseFieldsFromBlock(const BlockContents& contents, + const char** data, const char** offset, + size_t* num, size_t* base_lg); - bool MayMatch(const Slice& entry, uint64_t block_offset); - - // No copying allowed - BlockBasedFilterBlockReader(const BlockBasedFilterBlockReader&); - void operator=(const BlockBasedFilterBlockReader&); + bool 
MayMatch(const Slice& entry, uint64_t block_offset, bool no_io, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; }; + } // namespace rocksdb diff --git a/table/block_based/block_based_filter_block_test.cc b/table/block_based/block_based_filter_block_test.cc index 220888dd2f..70bbde96ac 100644 --- a/table/block_based/block_based_filter_block_test.cc +++ b/table/block_based/block_based_filter_block_test.cc @@ -10,6 +10,7 @@ #include "table/block_based/block_based_filter_block.h" #include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/coding.h" @@ -41,28 +42,58 @@ class TestHashFilter : public FilterPolicy { } }; +class MockBlockBasedTable : public BlockBasedTable { + public: + explicit MockBlockBasedTable(Rep* rep) + : BlockBasedTable(rep, nullptr /* block_cache_tracer */) { + // Initialize what Open normally does as much as necessary for the test + rep->cache_key_prefix_size = 10; + } +}; + class FilterBlockTest : public testing::Test { public: - TestHashFilter policy_; + Options options_; + ImmutableCFOptions ioptions_; + EnvOptions env_options_; BlockBasedTableOptions table_options_; + InternalKeyComparator icomp_; + std::unique_ptr table_; - FilterBlockTest() { - table_options_.filter_policy.reset(new TestHashFilter()); + FilterBlockTest() + : ioptions_(options_), + env_options_(options_), + icomp_(options_.comparator) { + table_options_.no_block_cache = true; + table_options_.filter_policy.reset(new TestHashFilter); + + constexpr bool skip_filters = false; + constexpr int level = 0; + constexpr bool immortal_table = false; + table_.reset(new MockBlockBasedTable( + new BlockBasedTable::Rep(ioptions_, env_options_, table_options_, + icomp_, skip_filters, level, immortal_table))); } }; TEST_F(FilterBlockTest, EmptyBuilder) { BlockBasedFilterBlockBuilder builder(nullptr, table_options_); - BlockContents 
block(builder.Finish()); - ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); - BlockBasedFilterBlockReader reader(nullptr, table_options_, true, - std::move(block), nullptr); + Slice slice(builder.Finish()); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(slice)); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + BlockBasedFilterBlockReader reader(table_.get(), std::move(block)); ASSERT_TRUE(reader.KeyMayMatch( "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(reader.KeyMayMatch( "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100000, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); } TEST_F(FilterBlockTest, SingleChunk) { @@ -77,30 +108,46 @@ TEST_F(FilterBlockTest, SingleChunk) { builder.StartBlock(300); builder.Add("hello"); ASSERT_EQ(5, builder.NumAdded()); - BlockContents block(builder.Finish()); - BlockBasedFilterBlockReader reader(nullptr, table_options_, true, - std::move(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch( - "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "box", /*prefix_extractor=*/nullptr, /*block_offset=*/100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, 
/*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + Slice slice(builder.Finish()); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + BlockBasedFilterBlockReader reader(table_.get(), std::move(block)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr, + /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr, + /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader.KeyMayMatch( "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader.KeyMayMatch( "other", /*prefix_extractor=*/nullptr, /*block_offset=*/100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); } TEST_F(FilterBlockTest, MultiChunk) { 
@@ -123,93 +170,139 @@ TEST_F(FilterBlockTest, MultiChunk) { builder.Add("box"); builder.Add("hello"); - BlockContents block(builder.Finish()); - BlockBasedFilterBlockReader reader(nullptr, table_options_, true, - std::move(block), nullptr); + Slice slice(builder.Finish()); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + BlockBasedFilterBlockReader reader(table_.get(), std::move(block)); // Check first filter - ASSERT_TRUE(reader.KeyMayMatch( - "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/2000, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr, + /*block_offset=*/2000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader.KeyMayMatch( "box", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader.KeyMayMatch( "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); // Check second filter - ASSERT_TRUE(reader.KeyMayMatch( - "box", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + 
ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader.KeyMayMatch( "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader.KeyMayMatch( "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader.KeyMayMatch( "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); // Check third filter (empty) ASSERT_TRUE(!reader.KeyMayMatch( "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader.KeyMayMatch( "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader.KeyMayMatch( "box", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader.KeyMayMatch( "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + 
/*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); // Check last filter - ASSERT_TRUE(reader.KeyMayMatch( - "box", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr, + /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader.KeyMayMatch( "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader.KeyMayMatch( "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); } // Test for block based filter block // use new interface in FilterPolicy to create filter builder/reader class BlockBasedFilterBlockTest : public testing::Test { public: + Options options_; + ImmutableCFOptions ioptions_; + EnvOptions env_options_; BlockBasedTableOptions table_options_; + InternalKeyComparator icomp_; + std::unique_ptr table_; - BlockBasedFilterBlockTest() { + BlockBasedFilterBlockTest() + : ioptions_(options_), + env_options_(options_), + icomp_(options_.comparator) { + table_options_.no_block_cache = true; table_options_.filter_policy.reset(NewBloomFilterPolicy(10)); - } - 
~BlockBasedFilterBlockTest() override {} + constexpr bool skip_filters = false; + constexpr int level = 0; + constexpr bool immortal_table = false; + table_.reset(new MockBlockBasedTable( + new BlockBasedTable::Rep(ioptions_, env_options_, table_options_, + icomp_, skip_filters, level, immortal_table))); + } }; TEST_F(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) { FilterBlockBuilder* builder = new BlockBasedFilterBlockBuilder(nullptr, table_options_); - BlockContents block(builder->Finish()); - ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); - FilterBlockReader* reader = new BlockBasedFilterBlockReader( - nullptr, table_options_, true, std::move(block), nullptr); + Slice slice(builder->Finish()); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(slice)); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FilterBlockReader* reader = + new BlockBasedFilterBlockReader(table_.get(), std::move(block)); ASSERT_TRUE(reader->KeyMayMatch( "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(reader->KeyMayMatch( "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/10000, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); delete builder; delete reader; @@ -226,30 +319,42 @@ TEST_F(BlockBasedFilterBlockTest, BlockBasedSingleChunk) { builder->Add("box"); builder->StartBlock(300); builder->Add("hello"); - BlockContents block(builder->Finish()); - FilterBlockReader* reader = new BlockBasedFilterBlockReader( - nullptr, table_options_, true, std::move(block), nullptr); + Slice slice(builder->Finish()); + + CachableEntry block( + new BlockContents(slice), 
nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FilterBlockReader* reader = + new BlockBasedFilterBlockReader(table_.get(), std::move(block)); ASSERT_TRUE(reader->KeyMayMatch( "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(reader->KeyMayMatch( "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(reader->KeyMayMatch( "box", /*prefix_extractor=*/nullptr, /*block_offset=*/100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(reader->KeyMayMatch( "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(reader->KeyMayMatch( "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "other", /*prefix_extractor=*/nullptr, /*block_offset=*/100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, 
/*get_context=*/nullptr, + /*lookup_context=*/nullptr)); delete builder; delete reader; @@ -276,65 +381,86 @@ TEST_F(BlockBasedFilterBlockTest, BlockBasedMultiChunk) { builder->Add("box"); builder->Add("hello"); - BlockContents block(builder->Finish()); - FilterBlockReader* reader = new BlockBasedFilterBlockReader( - nullptr, table_options_, true, std::move(block), nullptr); + Slice slice(builder->Finish()); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FilterBlockReader* reader = + new BlockBasedFilterBlockReader(table_.get(), std::move(block)); // Check first filter ASSERT_TRUE(reader->KeyMayMatch( "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(reader->KeyMayMatch( "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/2000, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "box", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); // Check second filter ASSERT_TRUE(reader->KeyMayMatch( "box", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, 
/*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); // Check third filter (empty) ASSERT_TRUE(!reader->KeyMayMatch( "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "box", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); // Check last filter 
ASSERT_TRUE(reader->KeyMayMatch( "box", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(reader->KeyMayMatch( "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader->KeyMayMatch( "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); delete builder; delete reader; diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 26c1365c4e..a888603d72 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -85,6 +85,8 @@ Status ReadBlockFromFile( const PersistentCacheOptions& cache_options, SequenceNumber global_seqno, size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator, bool for_compaction = false) { + assert(result); + BlockContents contents; BlockFetcher block_fetcher( file, prefetch_buffer, footer, options, handle, &contents, ioptions, @@ -99,6 +101,32 @@ Status ReadBlockFromFile( return s; } +Status ReadBlockFromFile( + RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, + const Footer& footer, const ReadOptions& options, const BlockHandle& handle, + std::unique_ptr* result, const ImmutableCFOptions& 
ioptions, + bool do_uncompress, bool maybe_compressed, BlockType block_type, + const UncompressionDict& uncompression_dict, + const PersistentCacheOptions& cache_options, + SequenceNumber /* global_seqno */, size_t /* read_amp_bytes_per_bit */, + MemoryAllocator* memory_allocator, bool for_compaction = false) { + assert(result); + + result->reset(new BlockContents); + + BlockFetcher block_fetcher( + file, prefetch_buffer, footer, options, handle, result->get(), ioptions, + do_uncompress, maybe_compressed, block_type, uncompression_dict, + cache_options, memory_allocator, nullptr, for_compaction); + + const Status s = block_fetcher.ReadBlockContents(); + if (!s.ok()) { + result->reset(); + } + + return s; +} + inline MemoryAllocator* GetMemoryAllocator( const BlockBasedTableOptions& table_options) { return table_options.block_cache.get() @@ -120,7 +148,6 @@ void DeleteCachedEntry(const Slice& /*key*/, void* value) { delete entry; } -void DeleteCachedFilterEntry(const Slice& key, void* value); void DeleteCachedUncompressionDictEntry(const Slice& key, void* value); // Release the cached entry and decrement its ref count. @@ -283,8 +310,9 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { // unmodified. 
static Status Create(const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, bool use_cache, - bool prefetch, bool pin, IndexReader** index_reader, - BlockCacheLookupContext* lookup_context) { + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { assert(table != nullptr); assert(table->get_rep()); assert(!pin || prefetch); @@ -304,7 +332,8 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { } } - *index_reader = new PartitionIndexReader(table, std::move(index_block)); + index_reader->reset( + new PartitionIndexReader(table, std::move(index_block))); return Status::OK(); } @@ -445,7 +474,7 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { size_t ApproximateMemoryUsage() const override { size_t usage = ApproximateIndexBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size((void*)this); + usage += malloc_usable_size(const_cast(this)); #else usage += sizeof(*this); #endif // ROCKSDB_MALLOC_USABLE_SIZE @@ -472,8 +501,9 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { // unmodified. 
static Status Create(const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, bool use_cache, - bool prefetch, bool pin, IndexReader** index_reader, - BlockCacheLookupContext* lookup_context) { + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { assert(table != nullptr); assert(table->get_rep()); assert(!pin || prefetch); @@ -493,7 +523,8 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { } } - *index_reader = new BinarySearchIndexReader(table, std::move(index_block)); + index_reader->reset( + new BinarySearchIndexReader(table, std::move(index_block))); return Status::OK(); } @@ -532,7 +563,7 @@ class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { size_t ApproximateMemoryUsage() const override { size_t usage = ApproximateIndexBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size((void*)this); + usage += malloc_usable_size(const_cast(this)); #else usage += sizeof(*this); #endif // ROCKSDB_MALLOC_USABLE_SIZE @@ -552,8 +583,9 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { static Status Create(const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_index_iter, bool use_cache, - bool prefetch, bool pin, IndexReader** index_reader, - BlockCacheLookupContext* lookup_context) { + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { assert(table != nullptr); assert(index_reader != nullptr); assert(!pin || prefetch); @@ -579,8 +611,7 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { // hard error. We can still fall back to the original binary search index. // So, Create will succeed regardless, from this point on. 
- auto new_index_reader = new HashIndexReader(table, std::move(index_block)); - *index_reader = new_index_reader; + index_reader->reset(new HashIndexReader(table, std::move(index_block))); // Get prefixes block BlockHandle prefixes_handle; @@ -636,7 +667,9 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { prefixes_meta_contents.data, &prefix_index); // TODO: log error if (s.ok()) { - new_index_reader->prefix_index_.reset(prefix_index); + HashIndexReader* const hash_index_reader = + static_cast(index_reader->get()); + hash_index_reader->prefix_index_.reset(prefix_index); } return Status::OK(); @@ -679,7 +712,7 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon { size_t ApproximateMemoryUsage() const override { size_t usage = ApproximateIndexBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size((void*)this); + usage += malloc_usable_size(const_cast(this)); #else if (prefix_index_) { usage += prefix_index_->ApproximateMemoryUsage(); @@ -1453,22 +1486,49 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( } } - { - // Find compression dictionary handle - bool found_compression_dict; - s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict, - &rep_->compression_dict_handle); + // Find compression dictionary handle + bool found_compression_dict = false; + s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict, + &rep_->compression_dict_handle); + if (!s.ok()) { + return s; } BlockBasedTableOptions::IndexType index_type = rep_->index_type; const bool use_cache = table_options.cache_index_and_filter_blocks; + // pin both index and filters, down to all partitions + const bool pin_all = + rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0; + // prefetch the first level of index const bool prefetch_index = prefetch_all || (table_options.pin_top_level_index_and_filter && index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); + // pin the first level of 
index + const bool pin_index = + pin_all || (table_options.pin_top_level_index_and_filter && + index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); + + std::unique_ptr index_reader; + s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache, + prefetch_index, pin_index, lookup_context, + &index_reader); + if (!s.ok()) { + return s; + } + + rep_->index_reader = std::move(index_reader); + + // The partitions of partitioned index are always stored in cache. They + // are hence follow the configuration for pin and prefetch regardless of + // the value of cache_index_and_filter_blocks + if (prefetch_all) { + rep_->index_reader->CacheDependencies(pin_all); + } + // prefetch the first level of filter const bool prefetch_filter = prefetch_all || @@ -1476,83 +1536,36 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( rep_->filter_type == Rep::FilterType::kPartitionedFilter); // Partition fitlers cannot be enabled without partition indexes assert(!prefetch_filter || prefetch_index); - // pin both index and filters, down to all partitions - const bool pin_all = - rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0; - // pin the first level of index - const bool pin_index = - pin_all || (table_options.pin_top_level_index_and_filter && - index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); // pin the first level of filter const bool pin_filter = pin_all || (table_options.pin_top_level_index_and_filter && rep_->filter_type == Rep::FilterType::kPartitionedFilter); - IndexReader* index_reader = nullptr; - if (s.ok()) { - s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache, - prefetch_index, pin_index, &index_reader, - lookup_context); - if (s.ok()) { - assert(index_reader != nullptr); - rep_->index_reader.reset(index_reader); - // The partitions of partitioned index are always stored in cache. 
They - // are hence follow the configuration for pin and prefetch regardless of - // the value of cache_index_and_filter_blocks + if (rep_->filter_policy) { + auto filter = new_table->CreateFilterBlockReader( + prefetch_buffer, use_cache, prefetch_filter, pin_filter, + lookup_context); + if (filter) { + // Refer to the comment above about paritioned indexes always being cached if (prefetch_all) { - rep_->index_reader->CacheDependencies(pin_all); + filter->CacheDependencies(pin_all); } - } else { - delete index_reader; - index_reader = nullptr; + + rep_->filter = std::move(filter); } } - // pre-fetching of blocks is turned on - // Will use block cache for meta-blocks access - // Always prefetch index and filter for level 0 // TODO(ajkr): also prefetch compression dictionary block // TODO(ajkr): also pin compression dictionary block when // `pin_l0_filter_and_index_blocks_in_cache == true`. - if (table_options.cache_index_and_filter_blocks) { - assert(table_options.block_cache != nullptr); - if (s.ok() && prefetch_filter) { - // Hack: Call GetFilter() to implicitly add filter to the block_cache - auto filter_entry = - new_table->GetFilter(rep_->table_prefix_extractor.get(), - /*prefetch_buffer=*/nullptr, /*no_io=*/false, - /*get_context=*/nullptr, lookup_context); - if (filter_entry.GetValue() != nullptr && prefetch_all) { - filter_entry.GetValue()->CacheDependencies( - pin_all, rep_->table_prefix_extractor.get()); - } - // if pin_filter is true then save it in rep_->filter_entry; it will be - // released in the destructor only, hence it will be pinned in the - // cache while this reader is alive - if (pin_filter) { - rep_->filter_entry = std::move(filter_entry); - } - } - } else { + if (!table_options.cache_index_and_filter_blocks) { std::unique_ptr compression_dict_block; - if (s.ok()) { - // Set filter block - if (rep_->filter_policy) { - const bool is_a_filter_partition = true; - auto filter = new_table->ReadFilter( - prefetch_buffer, rep_->filter_handle, 
!is_a_filter_partition, - rep_->table_prefix_extractor.get()); - rep_->filter.reset(filter); - // Refer to the comment above about paritioned indexes always being - // cached - if (filter && prefetch_all) { - filter->CacheDependencies(pin_all, - rep_->table_prefix_extractor.get()); - } - } - s = ReadCompressionDictBlock(prefetch_buffer, &compression_dict_block); + s = ReadCompressionDictBlock(prefetch_buffer, &compression_dict_block); + if (!s.ok()) { + return s; } - if (s.ok() && !rep_->compression_dict_handle.IsNull()) { + + if (!rep_->compression_dict_handle.IsNull()) { assert(compression_dict_block != nullptr); // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy rep_->uncompression_dict.reset(new UncompressionDict( @@ -1560,6 +1573,8 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( rep_->blocks_definitely_zstd_compressed, rep_->ioptions.statistics)); } } + + assert(s.ok()); return s; } @@ -1631,10 +1646,43 @@ Status BlockBasedTable::ReadMetaBlock(FilePrefetchBuffer* prefetch_buffer, return Status::OK(); } +template +class BlocklikeTraits; + +template <> +class BlocklikeTraits { + public: + static BlockContents* Create(BlockContents&& contents, + SequenceNumber /* global_seqno */, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */) { + return new BlockContents(std::move(contents)); + } + + static uint32_t GetNumRestarts(const BlockContents& /* contents */) { + return 0; + } +}; + +template <> +class BlocklikeTraits { + public: + static Block* Create(BlockContents&& contents, SequenceNumber global_seqno, + size_t read_amp_bytes_per_bit, Statistics* statistics) { + return new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit, + statistics); + } + + static uint32_t GetNumRestarts(const Block& block) { + return block.NumRestarts(); + } +}; + +template Status BlockBasedTable::GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* 
block_cache_compressed, - const ReadOptions& read_options, CachableEntry* block, + const ReadOptions& read_options, CachableEntry* block, const UncompressionDict& uncompression_dict, BlockType block_type, GetContext* get_context) const { const size_t read_amp_bytes_per_bit = @@ -1654,7 +1702,7 @@ Status BlockBasedTable::GetDataBlockFromCache( block_type, get_context); if (cache_handle != nullptr) { block->SetCachedValue( - reinterpret_cast(block_cache->Value(cache_handle)), + reinterpret_cast(block_cache->Value(cache_handle)), block_cache, cache_handle); return s; } @@ -1698,16 +1746,17 @@ Status BlockBasedTable::GetDataBlockFromCache( // Insert uncompressed block into block cache if (s.ok()) { - std::unique_ptr block_holder( - new Block(std::move(contents), rep_->get_global_seqno(block_type), - read_amp_bytes_per_bit, statistics)); // uncompressed block + std::unique_ptr block_holder( + BlocklikeTraits::Create( + std::move(contents), rep_->get_global_seqno(block_type), + read_amp_bytes_per_bit, statistics)); // uncompressed block if (block_cache != nullptr && block_holder->own_bytes() && read_options.fill_cache) { size_t charge = block_holder->ApproximateMemoryUsage(); Cache::Handle* cache_handle = nullptr; s = block_cache->Insert(block_cache_key, block_holder.get(), charge, - &DeleteCachedEntry, &cache_handle); + &DeleteCachedEntry, &cache_handle); #ifndef NDEBUG block_cache->TEST_mark_as_data_block(block_cache_key, charge); #endif // NDEBUG @@ -1730,10 +1779,11 @@ Status BlockBasedTable::GetDataBlockFromCache( return s; } +template Status BlockBasedTable::PutDataBlockToCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, - CachableEntry* cached_block, BlockContents* raw_block_contents, + CachableEntry* cached_block, BlockContents* raw_block_contents, CompressionType raw_block_comp_type, const UncompressionDict& uncompression_dict, SequenceNumber seq_no, MemoryAllocator* memory_allocator, 
BlockType block_type, @@ -1757,7 +1807,7 @@ Status BlockBasedTable::PutDataBlockToCache( Status s; Statistics* statistics = ioptions.statistics; - std::unique_ptr block_holder; + std::unique_ptr block_holder; if (raw_block_comp_type != kNoCompression) { // Retrieve the uncompressed contents into a new buffer BlockContents uncompressed_block_contents; @@ -1771,11 +1821,13 @@ Status BlockBasedTable::PutDataBlockToCache( return s; } - block_holder.reset(new Block(std::move(uncompressed_block_contents), seq_no, - read_amp_bytes_per_bit, statistics)); + block_holder.reset(BlocklikeTraits::Create( + std::move(uncompressed_block_contents), seq_no, read_amp_bytes_per_bit, + statistics)); } else { - block_holder.reset(new Block(std::move(*raw_block_contents), seq_no, - read_amp_bytes_per_bit, statistics)); + block_holder.reset(BlocklikeTraits::Create( + std::move(*raw_block_contents), seq_no, read_amp_bytes_per_bit, + statistics)); } // Insert compressed block into compressed block cache. @@ -1809,7 +1861,8 @@ Status BlockBasedTable::PutDataBlockToCache( size_t charge = block_holder->ApproximateMemoryUsage(); Cache::Handle* cache_handle = nullptr; s = block_cache->Insert(block_cache_key, block_holder.get(), charge, - &DeleteCachedEntry, &cache_handle, priority); + &DeleteCachedEntry, &cache_handle, + priority); #ifndef NDEBUG block_cache->TEST_mark_as_data_block(block_cache_key, charge); #endif // NDEBUG @@ -1829,173 +1882,38 @@ Status BlockBasedTable::PutDataBlockToCache( return s; } -FilterBlockReader* BlockBasedTable::ReadFilter( - FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_handle, - const bool is_a_filter_partition, - const SliceTransform* prefix_extractor) const { +std::unique_ptr BlockBasedTable::CreateFilterBlockReader( + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context) { auto& rep = rep_; - // TODO: We might want to unify with ReadBlockFromFile() if we start - // requiring 
checksum verification in Table::Open. - if (rep->filter_type == Rep::FilterType::kNoFilter) { - return nullptr; - } - BlockContents block; - - BlockFetcher block_fetcher( - rep->file.get(), prefetch_buffer, rep->footer, ReadOptions(), - filter_handle, &block, rep->ioptions, false /* decompress */, - false /*maybe_compressed*/, BlockType::kFilter, - UncompressionDict::GetEmptyDict(), rep->persistent_cache_options, - GetMemoryAllocator(rep->table_options)); - Status s = block_fetcher.ReadBlockContents(); - - if (!s.ok()) { - // Error reading the block - return nullptr; + auto filter_type = rep->filter_type; + if (filter_type == Rep::FilterType::kNoFilter) { + return std::unique_ptr(); } assert(rep->filter_policy); - auto filter_type = rep->filter_type; - if (rep->filter_type == Rep::FilterType::kPartitionedFilter && - is_a_filter_partition) { - filter_type = Rep::FilterType::kFullFilter; - } - switch (filter_type) { - case Rep::FilterType::kPartitionedFilter: { - return new PartitionedFilterBlockReader( - rep->prefix_filtering ? prefix_extractor : nullptr, - rep->whole_key_filtering, std::move(block), nullptr, - rep->ioptions.statistics, rep->internal_comparator, this, - rep_->index_key_includes_seq, rep_->index_value_is_full); - } + case Rep::FilterType::kPartitionedFilter: + return PartitionedFilterBlockReader::Create( + this, prefetch_buffer, use_cache, prefetch, pin, lookup_context); case Rep::FilterType::kBlockFilter: - return new BlockBasedFilterBlockReader( - rep->prefix_filtering ? prefix_extractor : nullptr, - rep->table_options, rep->whole_key_filtering, std::move(block), - rep->ioptions.statistics); + return BlockBasedFilterBlockReader::Create( + this, prefetch_buffer, use_cache, prefetch, pin, lookup_context); - case Rep::FilterType::kFullFilter: { - auto filter_bits_reader = - rep->filter_policy->GetFilterBitsReader(block.data); - assert(filter_bits_reader != nullptr); - return new FullFilterBlockReader( - rep->prefix_filtering ? 
prefix_extractor : nullptr, - rep->whole_key_filtering, std::move(block), filter_bits_reader, - rep->ioptions.statistics); - } + case Rep::FilterType::kFullFilter: + return FullFilterBlockReader::Create(this, prefetch_buffer, use_cache, + prefetch, pin, lookup_context); default: // filter_type is either kNoFilter (exited the function at the first if), // or it must be covered in this switch block assert(false); - return nullptr; + return std::unique_ptr(); } } -CachableEntry BlockBasedTable::GetFilter( - const SliceTransform* prefix_extractor, FilePrefetchBuffer* prefetch_buffer, - bool no_io, GetContext* get_context, - BlockCacheLookupContext* lookup_context) const { - const BlockHandle& filter_blk_handle = rep_->filter_handle; - const bool is_a_filter_partition = true; - return GetFilter(prefetch_buffer, filter_blk_handle, !is_a_filter_partition, - no_io, get_context, lookup_context, prefix_extractor); -} - -CachableEntry BlockBasedTable::GetFilter( - FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle, - const bool is_a_filter_partition, bool no_io, GetContext* get_context, - BlockCacheLookupContext* lookup_context, - const SliceTransform* prefix_extractor) const { - // If cache_index_and_filter_blocks is false, filter should be pre-populated. - // We will return rep_->filter anyway. rep_->filter can be nullptr if filter - // read fails at Open() time. We don't want to reload again since it will - // most probably fail again. 
- if (!is_a_filter_partition && - !rep_->table_options.cache_index_and_filter_blocks) { - return {rep_->filter.get(), /*cache=*/nullptr, /*cache_handle=*/nullptr, - /*own_value=*/false}; - } - - Cache* block_cache = rep_->table_options.block_cache.get(); - if (rep_->filter_policy == nullptr /* do not use filter */ || - block_cache == nullptr /* no block cache at all */) { - return CachableEntry(); - } - - if (!is_a_filter_partition && rep_->filter_entry.IsCached()) { - return {rep_->filter_entry.GetValue(), /*cache=*/nullptr, - /*cache_handle=*/nullptr, /*own_value=*/false}; - } - - PERF_TIMER_GUARD(read_filter_block_nanos); - - // Fetching from the cache - char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - filter_blk_handle, cache_key); - - Cache::Handle* cache_handle = - GetEntryFromCache(block_cache, key, BlockType::kFilter, get_context); - - FilterBlockReader* filter = nullptr; - size_t usage = 0; - bool is_cache_hit = false; - bool return_empty_reader = false; - if (cache_handle != nullptr) { - filter = - reinterpret_cast(block_cache->Value(cache_handle)); - usage = filter->ApproximateMemoryUsage(); - is_cache_hit = true; - } else if (no_io) { - // Do not invoke any io. - return_empty_reader = true; - } else { - filter = ReadFilter(prefetch_buffer, filter_blk_handle, - is_a_filter_partition, prefix_extractor); - if (filter != nullptr) { - usage = filter->ApproximateMemoryUsage(); - Status s = block_cache->Insert( - key, filter, usage, &DeleteCachedFilterEntry, &cache_handle, - rep_->table_options.cache_index_and_filter_blocks_with_high_priority - ? 
Cache::Priority::HIGH - : Cache::Priority::LOW); - if (s.ok()) { - UpdateCacheInsertionMetrics(BlockType::kFilter, get_context, usage); - } else { - RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES); - delete filter; - return_empty_reader = true; - } - } - } - - if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() && - lookup_context) { - // Avoid making copy of block_key and cf_name when constructing the access - // record. - BlockCacheTraceRecord access_record( - rep_->ioptions.env->NowMicros(), - /*block_key=*/"", TraceType::kBlockTraceFilterBlock, - /*block_size=*/usage, rep_->cf_id_for_tracing(), - /*cf_name=*/"", rep_->level_for_tracing(), - rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, - /*no_insert=*/no_io, lookup_context->get_id); - block_cache_tracer_->WriteBlockAccess(access_record, key, - rep_->cf_name_for_tracing(), - /*referenced_key=*/nullptr); - } - - if (return_empty_reader) { - return CachableEntry(); - } - return {filter, cache_handle ? block_cache : nullptr, cache_handle, - /*own_value=*/false}; -} - CachableEntry BlockBasedTable::GetUncompressionDict( FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, BlockCacheLookupContext* lookup_context) const { @@ -2178,6 +2096,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( } block.TransferTo(iter); + return iter; } @@ -2294,10 +2213,11 @@ Status BlockBasedTable::GetDataBlockFromCache( // If contents is non-null, it skips the cache lookup and disk read, since // the caller has already read it. In both cases, if ro.fill_cache is true, // it inserts the block into the block cache. 
+template Status BlockBasedTable::MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, BlockType block_type, + CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, BlockContents* contents) const { assert(block_entry != nullptr); @@ -2347,17 +2267,18 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) { no_insert = false; Statistics* statistics = rep_->ioptions.statistics; - bool do_decompress = - block_cache_compressed == nullptr && rep_->blocks_maybe_compressed; + const bool maybe_compressed = + block_type != BlockType::kFilter && rep_->blocks_maybe_compressed; + const bool do_uncompress = maybe_compressed && !block_cache_compressed; CompressionType raw_block_comp_type; BlockContents raw_block_contents; if (!contents) { StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS); BlockFetcher block_fetcher( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, - &raw_block_contents, rep_->ioptions, - do_decompress /* do uncompress */, rep_->blocks_maybe_compressed, - block_type, uncompression_dict, rep_->persistent_cache_options, + &raw_block_contents, rep_->ioptions, do_uncompress, + maybe_compressed, block_type, uncompression_dict, + rep_->persistent_cache_options, GetMemoryAllocator(rep_->table_options), GetMemoryAllocatorForCompressedBlock(rep_->table_options)); s = block_fetcher.ReadBlockContents(); @@ -2387,21 +2308,25 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( uint64_t nkeys = 0; if (block_entry->GetValue()) { // Approximate the number of keys in the block using restarts. 
- nkeys = rep_->table_options.block_restart_interval * - block_entry->GetValue()->NumRestarts(); + nkeys = + rep_->table_options.block_restart_interval * + BlocklikeTraits::GetNumRestarts(*block_entry->GetValue()); usage = block_entry->GetValue()->ApproximateMemoryUsage(); } TraceType trace_block_type = TraceType::kTraceMax; switch (block_type) { - case BlockType::kIndex: - trace_block_type = TraceType::kBlockTraceIndexBlock; - break; case BlockType::kData: trace_block_type = TraceType::kBlockTraceDataBlock; break; + case BlockType::kFilter: + trace_block_type = TraceType::kBlockTraceFilterBlock; + break; case BlockType::kRangeDeletion: trace_block_type = TraceType::kBlockTraceRangeDeletionBlock; break; + case BlockType::kIndex: + trace_block_type = TraceType::kBlockTraceIndexBlock; + break; default: // This cannot happen. assert(false); @@ -2603,10 +2528,11 @@ void BlockBasedTable::MaybeLoadBlocksToCache( } } +template Status BlockBasedTable::RetrieveBlock( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, BlockType block_type, + CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, bool for_compaction) const { assert(block_entry); @@ -2639,16 +2565,19 @@ Status BlockBasedTable::RetrieveBlock( return Status::Incomplete("no blocking io"); } - std::unique_ptr block; + const bool maybe_compressed = + block_type != BlockType::kFilter && rep_->blocks_maybe_compressed; + const bool do_uncompress = maybe_compressed; + std::unique_ptr block; { StopWatch sw(rep_->ioptions.env, rep_->ioptions.statistics, READ_BLOCK_GET_MICROS); s = ReadBlockFromFile( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block, - rep_->ioptions, rep_->blocks_maybe_compressed, - rep_->blocks_maybe_compressed, block_type, uncompression_dict, - rep_->persistent_cache_options, rep_->get_global_seqno(block_type), + 
rep_->ioptions, do_uncompress, maybe_compressed, block_type, + uncompression_dict, rep_->persistent_cache_options, + rep_->get_global_seqno(block_type), block_type == BlockType::kData ? rep_->table_options.read_amp_bytes_per_bit : 0, @@ -2665,6 +2594,22 @@ Status BlockBasedTable::RetrieveBlock( return s; } +// Explicitly instantiate templates for both "blocklike" types we use. +// This makes it possible to keep the template definitions in the .cc file. +template Status BlockBasedTable::RetrieveBlock( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction) const; + +template Status BlockBasedTable::RetrieveBlock( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction) const; + BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( const BlockBasedTable* table, std::unordered_map>* block_map) @@ -2733,10 +2678,7 @@ bool BlockBasedTable::PrefixMayMatch( Status s; // First, try check with full filter - auto filter_entry = - GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr, /*no_io=*/false, - /*get_context=*/nullptr, lookup_context); - FilterBlockReader* filter = filter_entry.GetValue(); + FilterBlockReader* const filter = rep_->filter.get(); bool filter_checked = true; if (filter != nullptr) { if (!filter->IsBlockBased()) { @@ -2798,7 +2740,7 @@ bool BlockBasedTable::PrefixMayMatch( BlockHandle handle = iiter->value().handle; may_match = filter->PrefixMayMatch( prefix, prefix_extractor, handle.offset(), /*no_io=*/false, - /*const_key_ptr=*/nullptr, lookup_context); + /*const_key_ptr=*/nullptr, 
/*get_context=*/nullptr, lookup_context); } } } @@ -3273,7 +3215,7 @@ FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator( bool BlockBasedTable::FullFilterKeyMayMatch( const ReadOptions& read_options, FilterBlockReader* filter, const Slice& internal_key, const bool no_io, - const SliceTransform* prefix_extractor, + const SliceTransform* prefix_extractor, GetContext* get_context, BlockCacheLookupContext* lookup_context) const { if (filter == nullptr || filter->IsBlockBased()) { return true; @@ -3281,20 +3223,21 @@ bool BlockBasedTable::FullFilterKeyMayMatch( Slice user_key = ExtractUserKey(internal_key); const Slice* const const_ikey_ptr = &internal_key; bool may_match = true; - if (filter->whole_key_filtering()) { + if (rep_->whole_key_filtering) { size_t ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size(); Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz); may_match = filter->KeyMayMatch(user_key_without_ts, prefix_extractor, kNotValid, - no_io, const_ikey_ptr, lookup_context); + no_io, const_ikey_ptr, get_context, lookup_context); } else if (!read_options.total_order_seek && prefix_extractor && rep_->table_properties->prefix_extractor_name.compare( prefix_extractor->Name()) == 0 && prefix_extractor->InDomain(user_key) && !filter->PrefixMayMatch(prefix_extractor->Transform(user_key), - prefix_extractor, kNotValid, false, - const_ikey_ptr, lookup_context)) { + prefix_extractor, kNotValid, no_io, + const_ikey_ptr, get_context, + lookup_context)) { may_match = false; } if (may_match) { @@ -3312,7 +3255,7 @@ void BlockBasedTable::FullFilterKeysMayMatch( if (filter == nullptr || filter->IsBlockBased()) { return; } - if (filter->whole_key_filtering()) { + if (rep_->whole_key_filtering) { filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io, lookup_context); } else if (!read_options.total_order_seek && prefix_extractor && @@ -3338,25 +3281,19 @@ Status BlockBasedTable::Get(const ReadOptions& 
read_options, const Slice& key, assert(get_context != nullptr); Status s; const bool no_io = read_options.read_tier == kBlockCacheTier; - CachableEntry filter_entry; - bool may_match; - FilterBlockReader* filter = nullptr; + + FilterBlockReader* const filter = + !skip_filters ? rep_->filter.get() : nullptr; + + // First check the full filter + // If full filter not useful, Then go into each block uint64_t tracing_get_id = get_context->get_tracing_get_id(); BlockCacheLookupContext lookup_context{TableReaderCaller::kUserGet, tracing_get_id}; - { - if (!skip_filters) { - filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr, - read_options.read_tier == kBlockCacheTier, - get_context, &lookup_context); - } - filter = filter_entry.GetValue(); + const bool may_match = + FullFilterKeyMayMatch(read_options, filter, key, no_io, prefix_extractor, + get_context, &lookup_context); - // First check the full filter - // If full filter not useful, Then go into each block - may_match = FullFilterKeyMayMatch(read_options, filter, key, no_io, - prefix_extractor, &lookup_context); - } if (!may_match) { RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); @@ -3388,7 +3325,8 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, filter != nullptr && filter->IsBlockBased() == true && !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz), prefix_extractor, v.handle.offset(), no_io, - /*const_ikey_ptr=*/nullptr, &lookup_context); + /*const_ikey_ptr=*/nullptr, get_context, + &lookup_context); if (not_exist_in_filter) { // Not found @@ -3510,31 +3448,23 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, const MultiGetRange* mget_range, const SliceTransform* prefix_extractor, bool skip_filters) { - const bool no_io = read_options.read_tier == kBlockCacheTier; - CachableEntry filter_entry; - FilterBlockReader* filter = nullptr; + FilterBlockReader* 
const filter = + !skip_filters ? rep_->filter.get() : nullptr; MultiGetRange sst_file_range(*mget_range, mget_range->begin(), mget_range->end()); + + // First check the full filter + // If full filter not useful, Then go into each block + const bool no_io = read_options.read_tier == kBlockCacheTier; uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId; if (!sst_file_range.empty() && sst_file_range.begin()->get_context) { tracing_mget_id = sst_file_range.begin()->get_context->get_tracing_get_id(); } BlockCacheLookupContext lookup_context{TableReaderCaller::kUserMultiGet, tracing_mget_id}; - if (!skip_filters) { - { - // TODO: Figure out where the stats should go - filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer=*/nullptr, - read_options.read_tier == kBlockCacheTier, - /*get_context=*/nullptr, &lookup_context); - } - filter = filter_entry.GetValue(); + FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io, + prefix_extractor, &lookup_context); - // First check the full filter - // If full filter not useful, Then go into each block - FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io, - prefix_extractor, &lookup_context); - } if (skip_filters || !sst_file_range.empty()) { IndexBlockIter iiter_on_stack; // if prefix_extractor found in block differs from options, disable @@ -4006,7 +3936,7 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, const Slice& key) { std::unique_ptr> iiter(NewIndexIterator( options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, - /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); iiter->Seek(key); assert(iiter->Valid()); @@ -4022,8 +3952,8 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, Status BlockBasedTable::CreateIndexReader( FilePrefetchBuffer* prefetch_buffer, InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch, - bool pin, IndexReader** index_reader, - 
BlockCacheLookupContext* lookup_context) { + bool pin, BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { // kHashSearch requires non-empty prefix_extractor but bypass checking // prefix_extractor here since we have no access to MutableCFOptions. // Add need_upper_bound_check flag in BlockBasedTable::NewIndexIterator. @@ -4033,14 +3963,14 @@ Status BlockBasedTable::CreateIndexReader( switch (rep_->index_type) { case BlockBasedTableOptions::kTwoLevelIndexSearch: { return PartitionIndexReader::Create(this, prefetch_buffer, use_cache, - prefetch, pin, index_reader, - lookup_context); + prefetch, pin, lookup_context, + index_reader); } case BlockBasedTableOptions::kBinarySearch: case BlockBasedTableOptions::kBinarySearchWithFirstKey: { return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache, - prefetch, pin, index_reader, - lookup_context); + prefetch, pin, lookup_context, + index_reader); } case BlockBasedTableOptions::kHashSearch: { std::unique_ptr meta_guard; @@ -4056,14 +3986,14 @@ Status BlockBasedTable::CreateIndexReader( " Fall back to binary search index."); return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache, prefetch, pin, - index_reader, lookup_context); + lookup_context, index_reader); } meta_index_iter = meta_iter_guard.get(); } return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter, - use_cache, prefetch, pin, index_reader, - lookup_context); + use_cache, prefetch, pin, lookup_context, + index_reader); } default: { std::string error_message = @@ -4079,7 +4009,7 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, std::unique_ptr> index_iter( NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, - /*lookup_contex=*/&context)); + /*lookup_context=*/&context)); index_iter->Seek(key); uint64_t result; @@ -4102,8 +4032,9 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, return result; } -bool 
BlockBasedTable::TEST_filter_block_preloaded() const { - return rep_->filter != nullptr; +bool BlockBasedTable::TEST_FilterBlockInCache() const { + assert(rep_ != nullptr); + return TEST_BlockInCache(rep_->filter_handle); } bool BlockBasedTable::TEST_IndexBlockInCache() const { @@ -4167,8 +4098,7 @@ Status BlockBasedTable::GetKVPairsFromDataBlocks( return Status::OK(); } -Status BlockBasedTable::DumpTable(WritableFile* out_file, - const SliceTransform* prefix_extractor) { +Status BlockBasedTable::DumpTable(WritableFile* out_file) { // Output Footer out_file->Append( "Footer Details:\n" @@ -4225,36 +4155,8 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file, " "); out_file->Append(table_properties->ToString("\n ", ": ").c_str()); out_file->Append("\n"); - - // Output Filter blocks - if (!rep_->filter && !table_properties->filter_policy_name.empty()) { - // Support only BloomFilter as off now - rocksdb::BlockBasedTableOptions table_options; - table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(1)); - if (table_properties->filter_policy_name.compare( - table_options.filter_policy->Name()) == 0) { - std::string filter_block_key = kFilterBlockPrefix; - filter_block_key.append(table_properties->filter_policy_name); - BlockHandle handle; - if (FindMetaBlock(meta_iter.get(), filter_block_key, &handle).ok()) { - BlockContents block; - BlockFetcher block_fetcher( - rep_->file.get(), nullptr /* prefetch_buffer */, rep_->footer, - ReadOptions(), handle, &block, rep_->ioptions, - false /*decompress*/, false /*maybe_compressed*/, - BlockType::kFilter, UncompressionDict::GetEmptyDict(), - rep_->persistent_cache_options); - s = block_fetcher.ReadBlockContents(); - if (!s.ok()) { - rep_->filter.reset(new BlockBasedFilterBlockReader( - prefix_extractor, table_options, - table_options.whole_key_filtering, std::move(block), - rep_->ioptions.statistics)); - } - } - } - } } + if (rep_->filter) { out_file->Append( "Filter Details:\n" @@ -4318,22 +4220,17 @@ void 
BlockBasedTable::Close() { return; } - Cache* const cache = rep_->table_options.block_cache.get(); - // cleanup index, filter, and compression dictionary blocks // to avoid accessing dangling pointers if (!rep_->table_options.no_block_cache) { - char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - - // Get the filter block key - auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - rep_->filter_handle, cache_key); - cache->Erase(key); - if (!rep_->compression_dict_handle.IsNull()) { // Get the compression dictionary block key - key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - rep_->compression_dict_handle, cache_key); + char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + auto key = + GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, + rep_->compression_dict_handle, cache_key); + + Cache* const cache = rep_->table_options.block_cache.get(); cache->Erase(key); } } @@ -4518,15 +4415,6 @@ void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value, namespace { -void DeleteCachedFilterEntry(const Slice& /*key*/, void* value) { - FilterBlockReader* filter = reinterpret_cast(value); - if (filter->statistics() != nullptr) { - RecordTick(filter->statistics(), BLOCK_CACHE_FILTER_BYTES_EVICT, - filter->ApproximateMemoryUsage()); - } - delete filter; -} - void DeleteCachedUncompressionDictEntry(const Slice& /*key*/, void* value) { UncompressionDict* dict = reinterpret_cast(value); RecordTick(dict->statistics(), BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT, diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 750700813d..189cd5d2e3 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -172,8 +172,7 @@ class BlockBasedTable : public TableReader { size_t ApproximateMemoryUsage() const override; // convert SST file to a human readable form - Status DumpTable(WritableFile* 
out_file, - const SliceTransform* prefix_extractor = nullptr) override; + Status DumpTable(WritableFile* out_file) override; Status VerifyChecksum(TableReaderCaller caller) override; @@ -181,7 +180,7 @@ class BlockBasedTable : public TableReader { ~BlockBasedTable(); - bool TEST_filter_block_preloaded() const; + bool TEST_FilterBlockInCache() const; bool TEST_IndexBlockInCache() const; // IndexReader is the interface that provides the functionality for index @@ -241,6 +240,8 @@ class BlockBasedTable : public TableReader { class PartitionedIndexIteratorState; + template + friend class FilterBlockReaderCommon; friend class PartitionIndexReader; protected: @@ -278,21 +279,23 @@ class BlockBasedTable : public TableReader { // @param block_entry value is set to the uncompressed block if found. If // in uncompressed block cache, also sets cache_handle to reference that // block. + template Status MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, BlockType block_type, + CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, BlockContents* contents) const; // Similar to the above, with one crucial difference: it will retrieve the // block from the file even if there are no caches configured (assuming the // read options allow I/O). 
+ template Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, BlockType block_type, - GetContext* get_context, + CachableEntry* block_entry, + BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, bool for_compaction = false) const; @@ -310,19 +313,6 @@ class BlockBasedTable : public TableReader { CachableEntry, MultiGetContext::MAX_BATCH_SIZE>* results, char* scratch, const UncompressionDict& uncompression_dict) const; - // For the following two functions: - // if `no_io == true`, we will not try to read filter/index from sst file - // were they not present in cache yet. - CachableEntry GetFilter( - const SliceTransform* prefix_extractor, - FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, - BlockCacheLookupContext* lookup_context) const; - virtual CachableEntry GetFilter( - FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle, - const bool is_a_filter_partition, bool no_io, GetContext* get_context, - BlockCacheLookupContext* lookup_context, - const SliceTransform* prefix_extractor) const; - CachableEntry GetUncompressionDict( FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, BlockCacheLookupContext* lookup_context) const; @@ -348,12 +338,13 @@ class BlockBasedTable : public TableReader { // pointer to the block as well as its block handle. // @param uncompression_dict Data for presetting the compression library's // dictionary. 
+ template Status GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, - const ReadOptions& read_options, CachableEntry* block, + const ReadOptions& read_options, CachableEntry* block, const UncompressionDict& uncompression_dict, BlockType block_type, - GetContext* get_context = nullptr) const; + GetContext* get_context) const; // Put a raw block (maybe compressed) to the corresponding block caches. // This method will perform decompression against raw_block if needed and then @@ -365,11 +356,12 @@ class BlockBasedTable : public TableReader { // PutDataBlockToCache(). After the call, the object will be invalid. // @param uncompression_dict Data for presetting the compression library's // dictionary. + template Status PutDataBlockToCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, - CachableEntry* cached_block, BlockContents* raw_block_contents, - CompressionType raw_block_comp_type, + CachableEntry* cached_block, + BlockContents* raw_block_contents, CompressionType raw_block_comp_type, const UncompressionDict& uncompression_dict, SequenceNumber seq_no, MemoryAllocator* memory_allocator, BlockType block_type, GetContext* get_context) const; @@ -387,13 +379,14 @@ class BlockBasedTable : public TableReader { Status CreateIndexReader(FilePrefetchBuffer* prefetch_buffer, InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch, bool pin, - IndexReader** index_reader, - BlockCacheLookupContext* lookup_context); + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader); bool FullFilterKeyMayMatch(const ReadOptions& read_options, FilterBlockReader* filter, const Slice& user_key, const bool no_io, const SliceTransform* prefix_extractor, + GetContext* get_context, BlockCacheLookupContext* lookup_context) const; void FullFilterKeysMayMatch(const ReadOptions& read_options, 
@@ -435,10 +428,9 @@ class BlockBasedTable : public TableReader { Status VerifyChecksumInBlocks(InternalIteratorBase* index_iter); // Create the filter from the filter block. - virtual FilterBlockReader* ReadFilter( - FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_handle, - const bool is_a_filter_partition, - const SliceTransform* prefix_extractor = nullptr) const; + std::unique_ptr CreateFilterBlockReader( + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context); static void SetupCacheKeyPrefix(Rep* rep); @@ -516,17 +508,7 @@ struct BlockBasedTable::Rep { // Footer contains the fixed table information Footer footer; - // `filter` and `uncompression_dict` will be populated (i.e., non-nullptr) - // and used only when options.block_cache is nullptr or when - // `cache_index_and_filter_blocks == false`. Otherwise, we will get the - // filter and compression dictionary blocks via the block cache. In that case, - // `filter_handle`, and `compression_dict_handle` are used to lookup these - // meta-blocks in block cache. - // - // Note: the IndexReader object is always stored in this member variable; - // the index block itself, however, may or may not be in the block cache - // based on the settings above. We plan to change the handling of the - // filter and compression dictionary similarly. 
+ std::unique_ptr index_reader; std::unique_ptr filter; std::unique_ptr uncompression_dict; @@ -553,13 +535,6 @@ struct BlockBasedTable::Rep { std::unique_ptr internal_prefix_transform; std::shared_ptr table_prefix_extractor; - // only used in level 0 files when pin_l0_filter_and_index_blocks_in_cache is - // true or in all levels when pin_top_level_index_and_filter is set in - // combination with partitioned filters: then we do use the LRU cache, - // but we always keep the filter block's handle checked out here (=we - // don't call Release()), plus the parsed out objects the LRU cache will never - // push flush them out, hence they're pinned - CachableEntry filter_entry; std::shared_ptr fragmented_range_dels; // If global_seqno is used, all Keys in this file will have the same diff --git a/table/block_based/cachable_entry.h b/table/block_based/cachable_entry.h index 5b5d16ef31..b4cd6ec675 100644 --- a/table/block_based/cachable_entry.h +++ b/table/block_based/cachable_entry.h @@ -10,6 +10,7 @@ #pragma once #include +#include "port/likely.h" #include "rocksdb/cache.h" #include "rocksdb/cleanable.h" diff --git a/table/block_based/filter_block.h b/table/block_based/filter_block.h index d54de5ae1a..936281bde6 100644 --- a/table/block_based/filter_block.h +++ b/table/block_based/filter_block.h @@ -38,6 +38,7 @@ namespace rocksdb { const uint64_t kNotValid = ULLONG_MAX; class FilterPolicy; +class GetContext; using MultiGetRange = MultiGetContext::Range; // A FilterBlockBuilder is used to construct all of the filters for a @@ -78,16 +79,14 @@ class FilterBlockBuilder { // BlockBased/Full FilterBlock would be called in the same way. 
class FilterBlockReader { public: - explicit FilterBlockReader() - : whole_key_filtering_(true), size_(0), statistics_(nullptr) {} - explicit FilterBlockReader(size_t s, Statistics* stats, - bool _whole_key_filtering) - : whole_key_filtering_(_whole_key_filtering), - size_(s), - statistics_(stats) {} - virtual ~FilterBlockReader() {} + FilterBlockReader() = default; + virtual ~FilterBlockReader() = default; + + FilterBlockReader(const FilterBlockReader&) = delete; + FilterBlockReader& operator=(const FilterBlockReader&) = delete; virtual bool IsBlockBased() = 0; // If is blockbased filter + /** * If no_io is set, then it returns true if it cannot answer the query without * reading data from disk. This is used in PartitionedFilterBlockReader to @@ -102,17 +101,19 @@ class FilterBlockReader { const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, - BlockCacheLookupContext* context) = 0; + GetContext* get_context, + BlockCacheLookupContext* lookup_context) = 0; virtual void KeysMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, - BlockCacheLookupContext* context) { + BlockCacheLookupContext* lookup_context) { for (auto iter = range->begin(); iter != range->end(); ++iter) { const Slice ukey = iter->ukey; const Slice ikey = iter->ikey; + GetContext* const get_context = iter->get_context; if (!KeyMayMatch(ukey, prefix_extractor, block_offset, no_io, &ikey, - context)) { + get_context, lookup_context)) { range->SkipKey(iter); } } @@ -125,27 +126,26 @@ class FilterBlockReader { const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, - BlockCacheLookupContext* context) = 0; + GetContext* get_context, + BlockCacheLookupContext* lookup_context) = 0; virtual void PrefixesMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, - 
BlockCacheLookupContext* context) { + BlockCacheLookupContext* lookup_context) { for (auto iter = range->begin(); iter != range->end(); ++iter) { const Slice ukey = iter->ukey; const Slice ikey = iter->ikey; + GetContext* const get_context = iter->get_context; if (!KeyMayMatch(prefix_extractor->Transform(ukey), prefix_extractor, - block_offset, no_io, &ikey, context)) { + block_offset, no_io, &ikey, get_context, + lookup_context)) { range->SkipKey(iter); } } } virtual size_t ApproximateMemoryUsage() const = 0; - virtual size_t size() const { return size_; } - virtual Statistics* statistics() const { return statistics_; } - - bool whole_key_filtering() const { return whole_key_filtering_; } // convert this object to a human readable form virtual std::string ToString() const { @@ -153,30 +153,22 @@ class FilterBlockReader { return error_msg; } - virtual void CacheDependencies(bool /*pin*/, - const SliceTransform* /*prefix_extractor*/) {} + virtual void CacheDependencies(bool /*pin*/) {} - virtual bool RangeMayExist( - const Slice* /*iterate_upper_bound*/, const Slice& user_key, - const SliceTransform* prefix_extractor, const Comparator* /*comparator*/, - const Slice* const const_ikey_ptr, bool* filter_checked, - bool /*need_upper_bound_check*/, BlockCacheLookupContext* context) { + virtual bool RangeMayExist(const Slice* /*iterate_upper_bound*/, + const Slice& user_key, + const SliceTransform* prefix_extractor, + const Comparator* /*comparator*/, + const Slice* const const_ikey_ptr, + bool* filter_checked, + bool /*need_upper_bound_check*/, + BlockCacheLookupContext* lookup_context) { *filter_checked = true; Slice prefix = prefix_extractor->Transform(user_key); return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, - const_ikey_ptr, context); + const_ikey_ptr, /* get_context */ nullptr, + lookup_context); } - - protected: - bool whole_key_filtering_; - - private: - // No copying allowed - FilterBlockReader(const FilterBlockReader&); - void operator=(const 
FilterBlockReader&); - size_t size_; - Statistics* statistics_; - int level_ = -1; }; } // namespace rocksdb diff --git a/table/block_based/filter_block_reader_common.cc b/table/block_based/filter_block_reader_common.cc new file mode 100644 index 0000000000..717a4ad0df --- /dev/null +++ b/table/block_based/filter_block_reader_common.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "table/block_based/filter_block_reader_common.h" +#include "monitoring/perf_context_imp.h" +#include "table/block_based/block_based_table_reader.h" + +namespace rocksdb { + +template +Status FilterBlockReaderCommon::ReadFilterBlock( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) { + PERF_TIMER_GUARD(read_filter_block_nanos); + + assert(table); + assert(filter_block); + assert(filter_block->IsEmpty()); + + const BlockBasedTable::Rep* const rep = table->get_rep(); + assert(rep); + + const Status s = + table->RetrieveBlock(prefetch_buffer, read_options, rep->filter_handle, + UncompressionDict::GetEmptyDict(), filter_block, + BlockType::kFilter, get_context, lookup_context); + + return s; +} + +template +const SliceTransform* +FilterBlockReaderCommon::table_prefix_extractor() const { + assert(table_); + + const BlockBasedTable::Rep* const rep = table_->get_rep(); + assert(rep); + + return rep->prefix_filtering ? 
rep->table_prefix_extractor.get() : nullptr; +} + +template +bool FilterBlockReaderCommon::whole_key_filtering() const { + assert(table_); + assert(table_->get_rep()); + + return table_->get_rep()->whole_key_filtering; +} + +template +Status FilterBlockReaderCommon::GetOrReadFilterBlock( + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) const { + assert(filter_block); + + if (!filter_block_.IsEmpty()) { + filter_block->SetUnownedValue(filter_block_.GetValue()); + return Status::OK(); + } + + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + return ReadFilterBlock(table_, nullptr /* prefetch_buffer */, read_options, + get_context, lookup_context, filter_block); +} + +template +size_t FilterBlockReaderCommon::ApproximateFilterBlockMemoryUsage() + const { + assert(!filter_block_.GetOwnValue() || filter_block_.GetValue() != nullptr); + return filter_block_.GetOwnValue() + ? filter_block_.GetValue()->ApproximateMemoryUsage() + : 0; +} + +// Explicitly instantiate templates for both "blocklike" types we use. +// This makes it possible to keep the template definitions in the .cc file. +template class FilterBlockReaderCommon; +template class FilterBlockReaderCommon; + +} // namespace rocksdb diff --git a/table/block_based/filter_block_reader_common.h b/table/block_based/filter_block_reader_common.h new file mode 100644 index 0000000000..3698d3f1e9 --- /dev/null +++ b/table/block_based/filter_block_reader_common.h @@ -0,0 +1,54 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// + +#pragma once + +#include +#include "table/block_based/cachable_entry.h" +#include "table/block_based/filter_block.h" + +namespace rocksdb { + +class BlockBasedTable; +class FilePrefetchBuffer; + +// Encapsulates common functionality for the various filter block reader +// implementations. Provides access to the filter block regardless of whether +// it is owned by the reader or stored in the cache, or whether it is pinned +// in the cache or not. +template +class FilterBlockReaderCommon : public FilterBlockReader { + public: + FilterBlockReaderCommon(const BlockBasedTable* t, + CachableEntry&& filter_block) + : table_(t), filter_block_(std::move(filter_block)) { + assert(table_); + } + + protected: + static Status ReadFilterBlock(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block); + + const BlockBasedTable* table() const { return table_; } + const SliceTransform* table_prefix_extractor() const; + bool whole_key_filtering() const; + + Status GetOrReadFilterBlock(bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) const; + + size_t ApproximateFilterBlockMemoryUsage() const; + + private: + const BlockBasedTable* table_; + CachableEntry filter_block_; +}; + +} // namespace rocksdb diff --git a/table/block_based/full_filter_block.cc b/table/block_based/full_filter_block.cc index 6d2b9d70a5..553bd37d97 100644 --- a/table/block_based/full_filter_block.cc +++ b/table/block_based/full_filter_block.cc @@ -16,6 +16,7 @@ #include "monitoring/perf_context_imp.h" #include "port/port.h" #include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" #include "util/coding.h" namespace rocksdb { @@ -98,59 +99,91 @@ Slice FullFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/, } FullFilterBlockReader::FullFilterBlockReader( - const 
SliceTransform* prefix_extractor, bool _whole_key_filtering, - const Slice& contents, FilterBitsReader* filter_bits_reader, - Statistics* stats) - : FilterBlockReader(contents.size(), stats, _whole_key_filtering), - prefix_extractor_(prefix_extractor), - contents_(contents) { - assert(filter_bits_reader != nullptr); - filter_bits_reader_.reset(filter_bits_reader); - if (prefix_extractor_ != nullptr) { + const BlockBasedTable* t, CachableEntry&& filter_block) + : FilterBlockReaderCommon(t, std::move(filter_block)) { + const SliceTransform* const prefix_extractor = table_prefix_extractor(); + if (prefix_extractor) { full_length_enabled_ = - prefix_extractor_->FullLengthEnabled(&prefix_extractor_full_length_); + prefix_extractor->FullLengthEnabled(&prefix_extractor_full_length_); } } -FullFilterBlockReader::FullFilterBlockReader( - const SliceTransform* prefix_extractor, bool _whole_key_filtering, - BlockContents&& contents, FilterBitsReader* filter_bits_reader, - Statistics* stats) - : FullFilterBlockReader(prefix_extractor, _whole_key_filtering, - contents.data, filter_bits_reader, stats) { - block_contents_ = std::move(contents); -} - bool FullFilterBlockReader::KeyMayMatch( const Slice& key, const SliceTransform* /*prefix_extractor*/, - uint64_t block_offset, const bool /*no_io*/, - const Slice* const /*const_ikey_ptr*/, - BlockCacheLookupContext* /*context*/) { + uint64_t block_offset, const bool no_io, + const Slice* const /*const_ikey_ptr*/, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG (void)block_offset; #endif assert(block_offset == kNotValid); - if (!whole_key_filtering_) { + if (!whole_key_filtering()) { return true; } - return MayMatch(key); + return MayMatch(key, no_io, get_context, lookup_context); +} + +std::unique_ptr FullFilterBlockReader::Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context) { + 
assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + + CachableEntry filter_block; + if (prefetch || !use_cache) { + const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(), + nullptr /* get_context */, lookup_context, + &filter_block); + if (!s.ok()) { + return std::unique_ptr(); + } + + if (use_cache && !pin) { + filter_block.Reset(); + } + } + + return std::unique_ptr( + new FullFilterBlockReader(table, std::move(filter_block))); } bool FullFilterBlockReader::PrefixMayMatch( const Slice& prefix, const SliceTransform* /* prefix_extractor */, - uint64_t block_offset, const bool /*no_io*/, - const Slice* const /*const_ikey_ptr*/, - BlockCacheLookupContext* /*context*/) { + uint64_t block_offset, const bool no_io, + const Slice* const /*const_ikey_ptr*/, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG (void)block_offset; #endif assert(block_offset == kNotValid); - return MayMatch(prefix); + return MayMatch(prefix, no_io, get_context, lookup_context); } -bool FullFilterBlockReader::MayMatch(const Slice& entry) { - if (contents_.size() != 0) { - if (filter_bits_reader_->MayMatch(entry)) { +bool FullFilterBlockReader::MayMatch( + const Slice& entry, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const { + CachableEntry filter_block; + + const Status s = + GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); + if (!s.ok()) { + return true; + } + + assert(filter_block.GetValue()); + + if (filter_block.GetValue()->data.size() != 0) { + assert(table()); + assert(table()->get_rep()); + + std::unique_ptr filter_bits_reader( + table()->get_rep()->filter_policy->GetFilterBitsReader( + filter_block.GetValue()->data)); + assert(filter_bits_reader != nullptr); + + if (filter_bits_reader->MayMatch(entry)) { PERF_COUNTER_ADD(bloom_sst_hit_count, 1); return true; } else { @@ -163,38 +196,58 @@ bool FullFilterBlockReader::MayMatch(const Slice& entry) { void 
FullFilterBlockReader::KeysMayMatch( MultiGetRange* range, const SliceTransform* /*prefix_extractor*/, - uint64_t block_offset, const bool /*no_io*/, - BlockCacheLookupContext* /*context*/) { + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG (void)range; (void)block_offset; #endif assert(block_offset == kNotValid); - if (!whole_key_filtering_) { + if (!whole_key_filtering()) { // Simply return. Don't skip any key - consider all keys as likely to be // present return; } - MayMatch(range); + MayMatch(range, no_io, lookup_context); } void FullFilterBlockReader::PrefixesMayMatch( MultiGetRange* range, const SliceTransform* /* prefix_extractor */, - uint64_t block_offset, const bool /*no_io*/, - BlockCacheLookupContext* /*context*/) { + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG (void)range; (void)block_offset; #endif assert(block_offset == kNotValid); - MayMatch(range); + MayMatch(range, no_io, lookup_context); } -void FullFilterBlockReader::MayMatch(MultiGetRange* range) { - if (contents_.size() == 0) { +void FullFilterBlockReader::MayMatch( + MultiGetRange* range, bool no_io, + BlockCacheLookupContext* lookup_context) const { + CachableEntry filter_block; + + const Status s = GetOrReadFilterBlock(no_io, range->begin()->get_context, + lookup_context, &filter_block); + if (!s.ok()) { return; } + assert(filter_block.GetValue()); + + if (filter_block.GetValue()->data.size() == 0) { + return; + } + + assert(table()); + assert(table()->get_rep()); + + std::unique_ptr filter_bits_reader( + table()->get_rep()->filter_policy->GetFilterBitsReader( + filter_block.GetValue()->data)); + assert(filter_bits_reader != nullptr); + // We need to use an array instead of autovector for may_match since // &may_match[0] doesn't work for autovector (compiler error). 
So // declare both keys and may_match as arrays, which is also slightly less @@ -205,7 +258,7 @@ void FullFilterBlockReader::MayMatch(MultiGetRange* range) { for (auto iter = range->begin(); iter != range->end(); ++iter) { keys[num_keys++] = &iter->ukey; } - filter_bits_reader_->MayMatch(num_keys, &keys[0], &may_match[0]); + filter_bits_reader->MayMatch(num_keys, &keys[0], &may_match[0]); int i = 0; for (auto iter = range->begin(); iter != range->end(); ++iter) { @@ -217,13 +270,11 @@ void FullFilterBlockReader::MayMatch(MultiGetRange* range) { } size_t FullFilterBlockReader::ApproximateMemoryUsage() const { - size_t usage = block_contents_.usable_size(); + size_t usage = ApproximateFilterBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size((void*)this); - usage += malloc_usable_size(filter_bits_reader_.get()); + usage += malloc_usable_size(const_cast(this)); #else usage += sizeof(*this); - usage += sizeof(*filter_bits_reader_.get()); #endif // ROCKSDB_MALLOC_USABLE_SIZE return usage; } @@ -232,7 +283,7 @@ bool FullFilterBlockReader::RangeMayExist( const Slice* iterate_upper_bound, const Slice& user_key, const SliceTransform* prefix_extractor, const Comparator* comparator, const Slice* const const_ikey_ptr, bool* filter_checked, - bool need_upper_bound_check, BlockCacheLookupContext* context) { + bool need_upper_bound_check, BlockCacheLookupContext* lookup_context) { if (!prefix_extractor || !prefix_extractor->InDomain(user_key)) { *filter_checked = false; return true; @@ -245,22 +296,23 @@ bool FullFilterBlockReader::RangeMayExist( } else { *filter_checked = true; return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, - const_ikey_ptr, context); + const_ikey_ptr, /* get_context */ nullptr, + lookup_context); } } bool FullFilterBlockReader::IsFilterCompatible( const Slice* iterate_upper_bound, const Slice& prefix, - const Comparator* comparator) { + const Comparator* comparator) const { // Try to reuse the bloom filter in 
the SST table if prefix_extractor in // mutable_cf_options has changed. If range [user_key, upper_bound) all // share the same prefix then we may still be able to use the bloom filter. - if (iterate_upper_bound != nullptr && prefix_extractor_) { - if (!prefix_extractor_->InDomain(*iterate_upper_bound)) { + const SliceTransform* const prefix_extractor = table_prefix_extractor(); + if (iterate_upper_bound != nullptr && prefix_extractor) { + if (!prefix_extractor->InDomain(*iterate_upper_bound)) { return false; } - Slice upper_bound_xform = - prefix_extractor_->Transform(*iterate_upper_bound); + Slice upper_bound_xform = prefix_extractor->Transform(*iterate_upper_bound); // first check if user_key and upper_bound all share the same prefix if (!comparator->Equal(prefix, upper_bound_xform)) { // second check if user_key's prefix is the immediate predecessor of diff --git a/table/block_based/full_filter_block.h b/table/block_based/full_filter_block.h index 99e5299b34..08a41706e6 100644 --- a/table/block_based/full_filter_block.h +++ b/table/block_based/full_filter_block.h @@ -15,7 +15,8 @@ #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" -#include "table/block_based/filter_block.h" +#include "table/block_based/filter_block_reader_common.h" +#include "table/format.h" #include "util/hash.h" namespace rocksdb { @@ -78,71 +79,58 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { // A FilterBlockReader is used to parse filter from SST table. // KeyMayMatch and PrefixMayMatch would trigger filter checking -class FullFilterBlockReader : public FilterBlockReader { +class FullFilterBlockReader : public FilterBlockReaderCommon { public: - // REQUIRES: "contents" and filter_bits_reader must stay live - // while *this is live. 
- explicit FullFilterBlockReader(const SliceTransform* prefix_extractor, - bool whole_key_filtering, - const Slice& contents, - FilterBitsReader* filter_bits_reader, - Statistics* statistics); - explicit FullFilterBlockReader(const SliceTransform* prefix_extractor, - bool whole_key_filtering, - BlockContents&& contents, - FilterBitsReader* filter_bits_reader, - Statistics* statistics); + FullFilterBlockReader(const BlockBasedTable* t, + CachableEntry&& filter_block); - // bits_reader is created in filter_policy, it should be passed in here - // directly. and be deleted here - ~FullFilterBlockReader() override {} + static std::unique_ptr Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context); bool IsBlockBased() override { return false; } bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, - const Slice* const const_ikey_ptr, - BlockCacheLookupContext* context) override; + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; bool PrefixMayMatch(const Slice& prefix, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, - BlockCacheLookupContext* context) override; + GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; void KeysMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, - BlockCacheLookupContext* context) override; + BlockCacheLookupContext* lookup_context) override; void PrefixesMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, - BlockCacheLookupContext* context) override; + BlockCacheLookupContext* lookup_context) override; size_t ApproximateMemoryUsage() const override; bool RangeMayExist(const Slice* iterate_upper_bound, 
const Slice& user_key, const SliceTransform* prefix_extractor, const Comparator* comparator, const Slice* const const_ikey_ptr, bool* filter_checked, bool need_upper_bound_check, - BlockCacheLookupContext* context) override; + BlockCacheLookupContext* lookup_context) override; + + private: + bool MayMatch(const Slice& entry, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; + void MayMatch(MultiGetRange* range, bool no_io, + BlockCacheLookupContext* lookup_context) const; + bool IsFilterCompatible(const Slice* iterate_upper_bound, const Slice& prefix, + const Comparator* comparator) const; private: - const SliceTransform* prefix_extractor_; - Slice contents_; - std::unique_ptr filter_bits_reader_; - BlockContents block_contents_; bool full_length_enabled_; size_t prefix_extractor_full_length_; - - // No copying allowed - FullFilterBlockReader(const FullFilterBlockReader&); - bool MayMatch(const Slice& entry); - void MayMatch(MultiGetRange* range); - void operator=(const FullFilterBlockReader&); - bool IsFilterCompatible(const Slice* iterate_upper_bound, - const Slice& prefix, const Comparator* comparator); - }; } // namespace rocksdb diff --git a/table/block_based/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc index 57ff158c5c..e8fcce07d7 100644 --- a/table/block_based/full_filter_block_test.cc +++ b/table/block_based/full_filter_block_test.cc @@ -6,6 +6,7 @@ #include "table/block_based/full_filter_block.h" #include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" #include "table/full_filter_bits_builder.h" #include "test_util/testharness.h" #include "test_util/testutil.h" @@ -40,6 +41,15 @@ class TestFilterBitsBuilder : public FilterBitsBuilder { std::vector hash_entries_; }; +class MockBlockBasedTable : public BlockBasedTable { + public: + explicit MockBlockBasedTable(Rep* rep) + : BlockBasedTable(rep, nullptr /* block_cache_tracer */) { + // Initialize what Open 
normally does as much as necessary for the test + rep->cache_key_prefix_size = 10; + } +}; + class TestFilterBitsReader : public FilterBitsReader { public: explicit TestFilterBitsReader(const Slice& contents) @@ -95,26 +105,46 @@ class TestHashFilter : public FilterPolicy { class PluginFullFilterBlockTest : public testing::Test { public: + Options options_; + ImmutableCFOptions ioptions_; + EnvOptions env_options_; BlockBasedTableOptions table_options_; + InternalKeyComparator icomp_; + std::unique_ptr table_; - PluginFullFilterBlockTest() { - table_options_.filter_policy.reset(new TestHashFilter()); + PluginFullFilterBlockTest() + : ioptions_(options_), + env_options_(options_), + icomp_(options_.comparator) { + table_options_.no_block_cache = true; + table_options_.filter_policy.reset(new TestHashFilter); + + constexpr bool skip_filters = false; + constexpr int level = 0; + constexpr bool immortal_table = false; + table_.reset(new MockBlockBasedTable( + new BlockBasedTable::Rep(ioptions_, env_options_, table_options_, + icomp_, skip_filters, level, immortal_table))); } }; TEST_F(PluginFullFilterBlockTest, PluginEmptyBuilder) { FullFilterBlockBuilder builder( nullptr, true, table_options_.filter_policy->GetFilterBitsBuilder()); - Slice block = builder.Finish(); - ASSERT_EQ("", EscapeString(block)); + Slice slice = builder.Finish(); + ASSERT_EQ("", EscapeString(slice)); - FullFilterBlockReader reader( - nullptr, true, block, - table_options_.filter_policy->GetFilterBitsReader(block), nullptr); + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FullFilterBlockReader reader(table_.get(), std::move(block)); // Remain same symantic with blockbased filter - ASSERT_TRUE(reader.KeyMayMatch( - "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + 
/*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); } TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) { @@ -125,57 +155,90 @@ TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) { builder.Add("box"); builder.Add("box"); builder.Add("hello"); - Slice block = builder.Finish(); - FullFilterBlockReader reader( - nullptr, true, block, - table_options_.filter_policy->GetFilterBitsReader(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch( - "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "box", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + Slice slice = builder.Finish(); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FullFilterBlockReader reader(table_.get(), std::move(block)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + 
ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader.KeyMayMatch( "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader.KeyMayMatch( "other", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); } class FullFilterBlockTest : public testing::Test { public: + Options options_; + ImmutableCFOptions ioptions_; + EnvOptions env_options_; BlockBasedTableOptions table_options_; + InternalKeyComparator icomp_; + std::unique_ptr table_; - FullFilterBlockTest() { + FullFilterBlockTest() + : ioptions_(options_), + env_options_(options_), + icomp_(options_.comparator) { + table_options_.no_block_cache = true; table_options_.filter_policy.reset(NewBloomFilterPolicy(10, false)); - } - ~FullFilterBlockTest() override {} + constexpr bool skip_filters = false; + constexpr int level = 0; + constexpr bool immortal_table = false; + table_.reset(new MockBlockBasedTable( + new BlockBasedTable::Rep(ioptions_, env_options_, table_options_, + icomp_, skip_filters, level, immortal_table))); + } }; TEST_F(FullFilterBlockTest, 
EmptyBuilder) { FullFilterBlockBuilder builder( nullptr, true, table_options_.filter_policy->GetFilterBitsBuilder()); - Slice block = builder.Finish(); - ASSERT_EQ("", EscapeString(block)); + Slice slice = builder.Finish(); + ASSERT_EQ("", EscapeString(slice)); - FullFilterBlockReader reader( - nullptr, true, block, - table_options_.filter_policy->GetFilterBitsReader(block), nullptr); + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FullFilterBlockReader reader(table_.get(), std::move(block)); // Remain same symantic with blockbased filter - ASSERT_TRUE(reader.KeyMayMatch( - "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); } TEST_F(FullFilterBlockTest, DuplicateEntries) { @@ -221,31 +284,46 @@ TEST_F(FullFilterBlockTest, SingleChunk) { builder.Add("box"); builder.Add("hello"); ASSERT_EQ(5, builder.NumAdded()); - Slice block = builder.Finish(); - FullFilterBlockReader reader( - nullptr, true, block, - table_options_.filter_policy->GetFilterBitsReader(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch( - "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "box", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, 
/*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); - ASSERT_TRUE(reader.KeyMayMatch( - "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + Slice slice = builder.Finish(); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FullFilterBlockReader reader(table_.get(), std::move(block)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader.KeyMayMatch( "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); ASSERT_TRUE(!reader.KeyMayMatch( "other", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, - /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*context=*/nullptr)); + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, 
+ /*lookup_context=*/nullptr)); } } // namespace rocksdb diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index dcd985152b..ae57e85dca 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -119,113 +119,77 @@ Slice PartitionedFilterBlockBuilder::Finish( } PartitionedFilterBlockReader::PartitionedFilterBlockReader( - const SliceTransform* prefix_extractor, bool _whole_key_filtering, - BlockContents&& contents, FilterBitsReader* /*filter_bits_reader*/, - Statistics* stats, const InternalKeyComparator comparator, - const BlockBasedTable* table, const bool index_key_includes_seq, - const bool index_value_is_full) - : FilterBlockReader(contents.data.size(), stats, _whole_key_filtering), - prefix_extractor_(prefix_extractor), - comparator_(comparator), - table_(table), - index_key_includes_seq_(index_key_includes_seq), - index_value_is_full_(index_value_is_full) { - idx_on_fltr_blk_.reset(new Block(std::move(contents), - kDisableGlobalSequenceNumber, - 0 /* read_amp_bytes_per_bit */, stats)); -} + const BlockBasedTable* t, CachableEntry&& filter_block) + : FilterBlockReaderCommon(t, std::move(filter_block)) {} -PartitionedFilterBlockReader::~PartitionedFilterBlockReader() { - // TODO(myabandeh): if instead of filter object we store only the blocks in - // block cache, then we don't have to manually earse them from block cache - // here. 
- auto block_cache = table_->rep_->table_options.block_cache.get(); - if (UNLIKELY(block_cache == nullptr)) { - return; - } - char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - IndexBlockIter biter; - BlockHandle handle; - Statistics* kNullStats = nullptr; - idx_on_fltr_blk_->NewIndexIterator( - &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, - /* have_first_key */ false, index_key_includes_seq_, - index_value_is_full_); - biter.SeekToFirst(); - for (; biter.Valid(); biter.Next()) { - handle = biter.value().handle; - auto key = BlockBasedTable::GetCacheKey(table_->rep_->cache_key_prefix, - table_->rep_->cache_key_prefix_size, - handle, cache_key); - block_cache->Erase(key); +std::unique_ptr PartitionedFilterBlockReader::Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + + CachableEntry filter_block; + if (prefetch || !use_cache) { + const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(), + nullptr /* get_context */, lookup_context, + &filter_block); + if (!s.ok()) { + return std::unique_ptr(); + } + + if (use_cache && !pin) { + filter_block.Reset(); + } } + + return std::unique_ptr( + new PartitionedFilterBlockReader(table, std::move(filter_block))); } bool PartitionedFilterBlockReader::KeyMayMatch( const Slice& key, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, - BlockCacheLookupContext* context) { + GetContext* get_context, BlockCacheLookupContext* lookup_context) { assert(const_ikey_ptr != nullptr); assert(block_offset == kNotValid); - if (!whole_key_filtering_) { + if (!whole_key_filtering()) { return true; } - if (UNLIKELY(idx_on_fltr_blk_->size() == 0)) { - return true; - } - auto filter_handle = GetFilterPartitionHandle(*const_ikey_ptr); 
- if (UNLIKELY(filter_handle.size() == 0)) { // key is out of range - return false; - } - auto filter_partition = - GetFilterPartition(/*prefetch_buffer=*/nullptr, filter_handle, no_io, - prefix_extractor, context); - if (UNLIKELY(!filter_partition.GetValue())) { - return true; - } - return filter_partition.GetValue()->KeyMayMatch( - key, prefix_extractor, block_offset, no_io, /*const_ikey_ptr=*/nullptr, - context); + + return MayMatch(key, prefix_extractor, block_offset, no_io, const_ikey_ptr, + get_context, lookup_context, + &FullFilterBlockReader::KeyMayMatch); } bool PartitionedFilterBlockReader::PrefixMayMatch( const Slice& prefix, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, - BlockCacheLookupContext* context) { + GetContext* get_context, BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG (void)block_offset; #endif assert(const_ikey_ptr != nullptr); assert(block_offset == kNotValid); - if (!prefix_extractor_ && !prefix_extractor) { + if (!table_prefix_extractor() && !prefix_extractor) { return true; } - if (UNLIKELY(idx_on_fltr_blk_->size() == 0)) { - return true; - } - auto filter_handle = GetFilterPartitionHandle(*const_ikey_ptr); - if (UNLIKELY(filter_handle.size() == 0)) { // prefix is out of range - return false; - } - auto filter_partition = - GetFilterPartition(/*prefetch_buffer=*/nullptr, filter_handle, no_io, - prefix_extractor, context); - if (UNLIKELY(!filter_partition.GetValue())) { - return true; - } - return filter_partition.GetValue()->PrefixMayMatch( - prefix, prefix_extractor, kNotValid, no_io, /*const_ikey_ptr=*/nullptr, - context); + + return MayMatch(prefix, prefix_extractor, block_offset, no_io, const_ikey_ptr, + get_context, lookup_context, + &FullFilterBlockReader::PrefixMayMatch); } BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( - const Slice& entry) { + const CachableEntry& filter_block, const Slice& entry) const { IndexBlockIter iter; + 
const InternalKeyComparator* const comparator = internal_comparator(); Statistics* kNullStats = nullptr; - idx_on_fltr_blk_->NewIndexIterator( - &comparator_, comparator_.user_comparator(), &iter, kNullStats, true, - /* have_first_key */ false, index_key_includes_seq_, - index_value_is_full_); + filter_block.GetValue()->NewIndexIterator( + comparator, comparator->user_comparator(), &iter, kNullStats, + true /* total_order_seek */, false /* have_first_key */, + index_key_includes_seq(), index_value_is_full()); iter.Seek(entry); if (UNLIKELY(!iter.Valid())) { return BlockHandle(0, 0); @@ -235,39 +199,78 @@ BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( return fltr_blk_handle; } -CachableEntry -PartitionedFilterBlockReader::GetFilterPartition( - FilePrefetchBuffer* prefetch_buffer, BlockHandle& fltr_blk_handle, - const bool no_io, const SliceTransform* prefix_extractor, - BlockCacheLookupContext* context) { - const bool is_a_filter_partition = true; - auto block_cache = table_->rep_->table_options.block_cache.get(); - if (LIKELY(block_cache != nullptr)) { - if (filter_map_.size() != 0) { - auto iter = filter_map_.find(fltr_blk_handle.offset()); - // This is a possible scenario since block cache might not have had space - // for the partition - if (iter != filter_map_.end()) { - return {iter->second.GetValue(), nullptr /* cache */, - nullptr /* cache_handle */, false /* own_value */}; - } +Status PartitionedFilterBlockReader::GetFilterPartitionBlock( + FilePrefetchBuffer* prefetch_buffer, const BlockHandle& fltr_blk_handle, + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) const { + assert(table()); + assert(filter_block); + assert(filter_block->IsEmpty()); + + if (!filter_map_.empty()) { + auto iter = filter_map_.find(fltr_blk_handle.offset()); + // This is a possible scenario since block cache might not have had space + // for the partition + if (iter != filter_map_.end()) { + 
filter_block->SetUnownedValue(iter->second.GetValue()); + return Status::OK(); } - return table_->GetFilter(/*prefetch_buffer=*/nullptr, fltr_blk_handle, - is_a_filter_partition, no_io, - /*get_context=*/nullptr, context, - prefix_extractor); - } else { - auto filter = table_->ReadFilter(prefetch_buffer, fltr_blk_handle, - is_a_filter_partition, prefix_extractor); - return {filter, nullptr /* cache */, nullptr /* cache_handle */, - true /* own_value */}; } + + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + const Status s = + table()->RetrieveBlock(prefetch_buffer, read_options, fltr_blk_handle, + UncompressionDict::GetEmptyDict(), filter_block, + BlockType::kFilter, get_context, lookup_context); + + return s; +} + +bool PartitionedFilterBlockReader::MayMatch( + const Slice& slice, const SliceTransform* prefix_extractor, + uint64_t block_offset, bool no_io, const Slice* const_ikey_ptr, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + FilterFunction filter_function) const { + CachableEntry filter_block; + Status s = + GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); + if (UNLIKELY(!s.ok())) { + return true; + } + + if (UNLIKELY(filter_block.GetValue()->size() == 0)) { + return true; + } + + auto filter_handle = GetFilterPartitionHandle(filter_block, *const_ikey_ptr); + if (UNLIKELY(filter_handle.size() == 0)) { // key is out of range + return false; + } + + CachableEntry filter_partition_block; + s = GetFilterPartitionBlock(nullptr /* prefetch_buffer */, filter_handle, + no_io, get_context, lookup_context, + &filter_partition_block); + if (UNLIKELY(!s.ok())) { + return true; + } + + FullFilterBlockReader filter_partition(table(), + std::move(filter_partition_block)); + return (filter_partition.*filter_function)( + slice, prefix_extractor, block_offset, no_io, const_ikey_ptr, get_context, + lookup_context); } size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { 
- size_t usage = idx_on_fltr_blk_->usable_size(); + size_t usage = ApproximateFilterBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size((void*)this); + usage += malloc_usable_size(const_cast(this)); #else usage += sizeof(*this); #endif // ROCKSDB_MALLOC_USABLE_SIZE @@ -276,16 +279,36 @@ size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { } // TODO(myabandeh): merge this with the same function in IndexReader -void PartitionedFilterBlockReader::CacheDependencies( - bool pin, const SliceTransform* prefix_extractor) { - // Before read partitions, prefetch them to avoid lots of IOs +void PartitionedFilterBlockReader::CacheDependencies(bool pin) { + assert(table()); + + const BlockBasedTable::Rep* const rep = table()->get_rep(); + assert(rep); + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + + CachableEntry filter_block; + + Status s = GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */, + &lookup_context, &filter_block); + if (!s.ok()) { + ROCKS_LOG_WARN(rep->ioptions.info_log, + "Error retrieving top-level filter block while trying to " + "cache filter partitions: %s", + s.ToString().c_str()); + return; + } + + // Before read partitions, prefetch them to avoid lots of IOs + assert(filter_block.GetValue()); + IndexBlockIter biter; + const InternalKeyComparator* const comparator = internal_comparator(); Statistics* kNullStats = nullptr; - idx_on_fltr_blk_->NewIndexIterator( - &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, - /* have_first_key */ false, index_key_includes_seq_, - index_value_is_full_); + filter_block.GetValue()->NewIndexIterator( + comparator, comparator->user_comparator(), &biter, kNullStats, + true /* total_order_seek */, false /* have_first_key */, + index_key_includes_seq(), index_value_is_full()); // Index partitions are assumed to be consecuitive. Prefetch them all. 
// Read the first block offset biter.SeekToFirst(); @@ -298,27 +321,55 @@ void PartitionedFilterBlockReader::CacheDependencies( uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; - auto& file = table_->rep_->file; + prefetch_buffer.reset(new FilePrefetchBuffer()); - Status s; - s = prefetch_buffer->Prefetch(file.get(), prefetch_off, + s = prefetch_buffer->Prefetch(rep->file.get(), prefetch_off, static_cast(prefetch_len)); // After prefetch, read the partitions one by one - biter.SeekToFirst(); - for (; biter.Valid(); biter.Next()) { + ReadOptions read_options; + for (biter.SeekToFirst(); biter.Valid(); biter.Next()) { handle = biter.value().handle; - const bool no_io = true; - const bool is_a_filter_partition = true; - auto filter = table_->GetFilter( - prefetch_buffer.get(), handle, is_a_filter_partition, !no_io, - /*get_context=*/nullptr, &lookup_context, prefix_extractor); - if (LIKELY(filter.IsCached())) { - if (pin) { - filter_map_[handle.offset()] = std::move(filter); + + CachableEntry block; + // TODO: Support counter batch update for partitioned index and + // filter blocks + s = table()->MaybeReadBlockAndLoadToCache( + prefetch_buffer.get(), read_options, handle, + UncompressionDict::GetEmptyDict(), &block, BlockType::kFilter, + nullptr /* get_context */, &lookup_context, nullptr /* contents */); + + assert(s.ok() || block.GetValue() == nullptr); + if (s.ok() && block.GetValue() != nullptr) { + if (block.IsCached()) { + if (pin) { + filter_map_[handle.offset()] = std::move(block); + } } } } } +const InternalKeyComparator* PartitionedFilterBlockReader::internal_comparator() + const { + assert(table()); + assert(table()->get_rep()); + + return &table()->get_rep()->internal_comparator; +} + +bool PartitionedFilterBlockReader::index_key_includes_seq() const { + assert(table()); + assert(table()->get_rep()); + + return 
table()->get_rep()->index_key_includes_seq; +} + +bool PartitionedFilterBlockReader::index_value_is_full() const { + assert(table()); + assert(table()->get_rep()); + + return table()->get_rep()->index_value_is_full; +} + } // namespace rocksdb diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h index 4b0fb523d0..b73ae3baa7 100644 --- a/table/block_based/partitioned_filter_block.h +++ b/table/block_based/partitioned_filter_block.h @@ -14,8 +14,7 @@ #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "table/block_based/block.h" -#include "table/block_based/block_based_table_reader.h" -#include "table/block_based/cachable_entry.h" +#include "table/block_based/filter_block_reader_common.h" #include "table/block_based/full_filter_block.h" #include "util/autovector.h" @@ -69,44 +68,57 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { BlockHandle last_encoded_handle_; }; -class PartitionedFilterBlockReader : public FilterBlockReader { +class PartitionedFilterBlockReader : public FilterBlockReaderCommon { public: - explicit PartitionedFilterBlockReader( - const SliceTransform* prefix_extractor, bool whole_key_filtering, - BlockContents&& contents, FilterBitsReader* filter_bits_reader, - Statistics* stats, const InternalKeyComparator comparator, - const BlockBasedTable* table, const bool index_key_includes_seq, - const bool index_value_is_full); - ~PartitionedFilterBlockReader() override; + PartitionedFilterBlockReader(const BlockBasedTable* t, + CachableEntry&& filter_block); + + static std::unique_ptr Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context); bool IsBlockBased() override { return false; } bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, - const Slice* const const_ikey_ptr, - 
BlockCacheLookupContext* context) override; + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; bool PrefixMayMatch(const Slice& prefix, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, - BlockCacheLookupContext* context) override; + GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + size_t ApproximateMemoryUsage() const override; private: - BlockHandle GetFilterPartitionHandle(const Slice& entry); - CachableEntry GetFilterPartition( - FilePrefetchBuffer* prefetch_buffer, BlockHandle& handle, - const bool no_io, const SliceTransform* prefix_extractor, - BlockCacheLookupContext* context); - void CacheDependencies(bool bin, - const SliceTransform* prefix_extractor) override; + BlockHandle GetFilterPartitionHandle(const CachableEntry& filter_block, + const Slice& entry) const; + Status GetFilterPartitionBlock( + FilePrefetchBuffer* prefetch_buffer, const BlockHandle& handle, + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) const; - const SliceTransform* prefix_extractor_; - std::unique_ptr idx_on_fltr_blk_; - const InternalKeyComparator comparator_; - const BlockBasedTable* table_; - const bool index_key_includes_seq_; - const bool index_value_is_full_; - std::unordered_map> filter_map_; + using FilterFunction = bool (FullFilterBlockReader::*)( + const Slice& slice, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context); + bool MayMatch(const Slice& slice, const SliceTransform* prefix_extractor, + uint64_t block_offset, bool no_io, const Slice* const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + FilterFunction filter_function) const; + void CacheDependencies(bool pin) override; + + 
const InternalKeyComparator* internal_comparator() const; + bool index_key_includes_seq() const; + bool index_value_is_full() const; + + protected: + std::unordered_map> filter_map_; }; } // namespace rocksdb diff --git a/table/block_based/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc index 34ecfa4ac6..5e9e467723 100644 --- a/table/block_based/partitioned_filter_block_test.cc +++ b/table/block_based/partitioned_filter_block_test.cc @@ -7,6 +7,7 @@ #include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" #include "table/block_based/partitioned_filter_block.h" #include "table/full_filter_bits_builder.h" @@ -23,34 +24,29 @@ std::map slices; class MockedBlockBasedTable : public BlockBasedTable { public: - explicit MockedBlockBasedTable(Rep* rep) + MockedBlockBasedTable(Rep* rep, PartitionedIndexBuilder* pib) : BlockBasedTable(rep, /*block_cache_tracer=*/nullptr) { // Initialize what Open normally does as much as necessary for the test rep->cache_key_prefix_size = 10; + rep->index_key_includes_seq = pib->seperator_is_key_plus_seq(); + rep->index_value_is_full = !pib->get_use_value_delta_encoding(); } +}; - CachableEntry GetFilter( - FilePrefetchBuffer*, const BlockHandle& filter_blk_handle, - const bool /* unused */, bool /* unused */, GetContext* /* unused */, - BlockCacheLookupContext* /*context*/, - const SliceTransform* prefix_extractor) const override { - Slice slice = slices[filter_blk_handle.offset()]; - auto obj = new FullFilterBlockReader( - prefix_extractor, true, BlockContents(slice), - rep_->table_options.filter_policy->GetFilterBitsReader(slice), nullptr); - return {obj, nullptr /* cache */, nullptr /* cache_handle */, - true /* own_value */}; - } +class MyPartitionedFilterBlockReader : public PartitionedFilterBlockReader { + public: + MyPartitionedFilterBlockReader(BlockBasedTable* t, + CachableEntry&& filter_block) + : PartitionedFilterBlockReader(t, std::move(filter_block)) { + 
for (const auto& pair : slices) { + const uint64_t offset = pair.first; + const Slice& slice = pair.second; - FilterBlockReader* ReadFilter( - FilePrefetchBuffer*, const BlockHandle& filter_blk_handle, - const bool /* unused */, - const SliceTransform* prefix_extractor) const override { - Slice slice = slices[filter_blk_handle.offset()]; - auto obj = new FullFilterBlockReader( - prefix_extractor, true, BlockContents(slice), - rep_->table_options.filter_policy->GetFilterBitsReader(slice), nullptr); - return obj; + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, + nullptr /* cache_handle */, true /* own_value */); + filter_map_[offset] = std::move(block); + } } }; @@ -58,10 +54,18 @@ class PartitionedFilterBlockTest : public testing::Test, virtual public ::testing::WithParamInterface { public: + Options options_; + ImmutableCFOptions ioptions_; + EnvOptions env_options_; BlockBasedTableOptions table_options_; - InternalKeyComparator icomp = InternalKeyComparator(BytewiseComparator()); + InternalKeyComparator icomp_; + std::unique_ptr table_; + std::shared_ptr cache_; - PartitionedFilterBlockTest() { + PartitionedFilterBlockTest() + : ioptions_(options_), + env_options_(options_), + icomp_(options_.comparator) { table_options_.filter_policy.reset(NewBloomFilterPolicy(10, false)); table_options_.no_block_cache = true; // Otherwise BlockBasedTable::Close // will access variable that are not @@ -70,7 +74,6 @@ class PartitionedFilterBlockTest table_options_.index_block_restart_interval = 3; } - std::shared_ptr cache_; ~PartitionedFilterBlockTest() override {} const std::string keys[4] = {"afoo", "bar", "box", "hello"}; @@ -110,7 +113,7 @@ class PartitionedFilterBlockTest PartitionedIndexBuilder* NewIndexBuilder() { const bool kValueDeltaEncoded = true; return PartitionedIndexBuilder::CreateIndexBuilder( - &icomp, !kValueDeltaEncoded, table_options_); + &icomp_, !kValueDeltaEncoded, table_options_); } PartitionedFilterBlockBuilder* NewBuilder( @@ 
-131,11 +134,8 @@ class PartitionedFilterBlockTest p_index_builder, partition_size); } - std::unique_ptr table; - PartitionedFilterBlockReader* NewReader( - PartitionedFilterBlockBuilder* builder, PartitionedIndexBuilder* pib, - const SliceTransform* prefix_extractor) { + PartitionedFilterBlockBuilder* builder, PartitionedIndexBuilder* pib) { BlockHandle bh; Status status; Slice slice; @@ -143,19 +143,21 @@ class PartitionedFilterBlockTest slice = builder->Finish(bh, &status); bh = Write(slice); } while (status.IsIncomplete()); - const Options options; - const ImmutableCFOptions ioptions(options); - const MutableCFOptions moptions(options); - const EnvOptions env_options; - const bool kSkipFilters = true; - const bool kImmortal = true; - table.reset(new MockedBlockBasedTable( - new BlockBasedTable::Rep(ioptions, env_options, table_options_, icomp, - !kSkipFilters, 0, !kImmortal))); - auto reader = new PartitionedFilterBlockReader( - prefix_extractor, true, BlockContents(slice), nullptr, nullptr, icomp, - table.get(), pib->seperator_is_key_plus_seq(), - !pib->get_use_value_delta_encoding()); + + constexpr bool skip_filters = false; + constexpr int level = 0; + constexpr bool immortal_table = false; + table_.reset(new MockedBlockBasedTable( + new BlockBasedTable::Rep(ioptions_, env_options_, table_options_, + icomp_, skip_filters, level, immortal_table), + pib)); + BlockContents contents(slice); + CachableEntry block( + new Block(std::move(contents), kDisableGlobalSequenceNumber, + 0 /* read_amp_bytes_per_bit */, nullptr), + nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); + auto reader = + new MyPartitionedFilterBlockReader(table_.get(), std::move(block)); return reader; } @@ -163,36 +165,37 @@ class PartitionedFilterBlockTest PartitionedIndexBuilder* pib, bool empty = false, const SliceTransform* prefix_extractor = nullptr) { std::unique_ptr reader( - NewReader(builder, pib, prefix_extractor)); + NewReader(builder, pib)); // Querying added 
keys const bool no_io = true; for (auto key : keys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); ASSERT_TRUE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, !no_io, - &ikey_slice, /*context=*/nullptr)); + &ikey_slice, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); } { // querying a key twice auto ikey = InternalKey(keys[0], 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); - ASSERT_TRUE(reader->KeyMayMatch(keys[0], prefix_extractor, kNotValid, - !no_io, &ikey_slice, - /*context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + keys[0], prefix_extractor, kNotValid, !no_io, &ikey_slice, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); } // querying missing keys for (auto key : missing_keys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); if (empty) { - ASSERT_TRUE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, - !no_io, &ikey_slice, - /*context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + key, prefix_extractor, kNotValid, !no_io, &ikey_slice, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); } else { // assuming a good hash function - ASSERT_FALSE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, - !no_io, &ikey_slice, - /*context=*/nullptr)); + ASSERT_FALSE(reader->KeyMayMatch( + key, prefix_extractor, kNotValid, !no_io, &ikey_slice, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); } } } @@ -336,13 +339,14 @@ TEST_P(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) { builder->Add(pkeys[2]); CutABlock(pib.get(), pkeys[2]); std::unique_ptr reader( - NewReader(builder.get(), pib.get(), prefix_extractor.get())); + NewReader(builder.get(), pib.get())); for (auto key : pkeys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); ASSERT_TRUE(reader->PrefixMayMatch( prefix_extractor->Transform(key), prefix_extractor.get(), kNotValid, 
- /*no_io=*/false, &ikey_slice, /*context=*/nullptr)); + /*no_io=*/false, &ikey_slice, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); } } diff --git a/table/table_reader.h b/table/table_reader.h index 1c879cb1f8..72d11a7bd2 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -116,8 +116,7 @@ class TableReader { } // convert db file to a human readable form - virtual Status DumpTable(WritableFile* /*out_file*/, - const SliceTransform* /*prefix_extractor*/) { + virtual Status DumpTable(WritableFile* /*out_file*/) { return Status::NotSupported("DumpTable() not supported"); } diff --git a/table/table_test.cc b/table/table_test.cc index c3a1f82ed3..c54933b781 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -2296,7 +2296,7 @@ TEST_P(BlockBasedTableTest, BlockCacheDisabledTest) { // preloading filter/index blocks is enabled. auto reader = dynamic_cast(c.GetTableReader()); - ASSERT_TRUE(reader->TEST_filter_block_preloaded()); + ASSERT_FALSE(reader->TEST_FilterBlockInCache()); ASSERT_FALSE(reader->TEST_IndexBlockInCache()); { @@ -2343,7 +2343,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { GetPlainInternalComparator(options.comparator), &keys, &kvmap); // preloading filter/index blocks is prohibited. auto* reader = dynamic_cast(c.GetTableReader()); - ASSERT_TRUE(!reader->TEST_filter_block_preloaded()); + ASSERT_FALSE(reader->TEST_FilterBlockInCache()); ASSERT_TRUE(reader->TEST_IndexBlockInCache()); // -- PART 1: Open with regular block cache. 
@@ -2476,7 +2476,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { MutableCFOptions moptions4(options); ASSERT_OK(c3.Reopen(ioptions4, moptions4)); reader = dynamic_cast(c3.GetTableReader()); - ASSERT_TRUE(!reader->TEST_filter_block_preloaded()); + ASSERT_FALSE(reader->TEST_FilterBlockInCache()); PinnableSlice value; GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc index 260d15f303..44a733b57c 100644 --- a/tools/sst_dump_tool.cc +++ b/tools/sst_dump_tool.cc @@ -150,8 +150,7 @@ Status SstFileDumper::DumpTable(const std::string& out_filename) { std::unique_ptr out_file; Env* env = Env::Default(); env->NewWritableFile(out_filename, &out_file, soptions_); - Status s = table_reader_->DumpTable(out_file.get(), - moptions_.prefix_extractor.get()); + Status s = table_reader_->DumpTable(out_file.get()); out_file->Close(); return s; }