diff --git a/HISTORY.md b/HISTORY.md index 7272fedb4d..f9c381a952 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,9 @@ * For users of `Statistics` objects created via `CreateDBStatistics()`, the format of the string returned by its `ToString()` method has changed. * With LRUCache, when high_pri_pool_ratio > 0, midpoint insertion strategy will be enabled to put low-pri items to the tail of low-pri list (the midpoint) when they first inserted into the cache. This is to make cache entries never get hit age out faster, improving cache efficiency when large background scan presents. +### New Features +* Changes the format of index blocks by storing the keys in their raw form rather than converting them to InternalKey. This saves 8 bytes per index key. The feature is backward compatible but not forward compatible. It is disabled by default unless format_version 3 or above is used. + ## 5.14.0 (5/16/2018) ### Public API Change * Add a BlockBasedTableOption to align uncompressed data blocks on the smaller of block size or page size boundary, to reduce flash reads by avoiding reads spanning 4K pages. diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc index 93a0e0f7c6..ef4eb0f0d9 100644 --- a/db/db_properties_test.cc +++ b/db/db_properties_test.cc @@ -177,17 +177,16 @@ void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) { std::replace(tp_string.begin(), tp_string.end(), ';', ' '); std::replace(tp_string.begin(), tp_string.end(), '=', ' '); ResetTableProperties(tp); - sscanf(tp_string.c_str(), "# data blocks %" SCNu64 " # entries %" SCNu64 " raw key size %" SCNu64 " raw average key size %lf " " raw value size %" SCNu64 " raw average value size %lf " - " data block size %" SCNu64 " index block size %" SCNu64 - " filter block size %" SCNu64, + " data block size %" SCNu64 " index block size (user-key? 
%" SCNu64 + ") %" SCNu64 " filter block size %" SCNu64, &tp->num_data_blocks, &tp->num_entries, &tp->raw_key_size, &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size, - &tp->index_size, &tp->filter_size); + &tp->index_key_is_user_key, &tp->index_size, &tp->filter_size); } void VerifySimilar(uint64_t a, uint64_t b, double bias) { @@ -224,7 +223,8 @@ void GetExpectedTableProperties(TableProperties* expected_tp, const int kKeySize, const int kValueSize, const int kKeysPerTable, const int kTableCount, const int kBloomBitsPerKey, - const size_t kBlockSize) { + const size_t kBlockSize, + const bool index_key_is_user_key) { const int kKeyCount = kTableCount * kKeysPerTable; const int kAvgSuccessorSize = kKeySize / 5; const int kEncodingSavePerKey = kKeySize / 4; @@ -238,7 +238,8 @@ void GetExpectedTableProperties(TableProperties* expected_tp, expected_tp->data_size = kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize)); expected_tp->index_size = - expected_tp->num_data_blocks * (kAvgSuccessorSize + 8); + expected_tp->num_data_blocks * + (kAvgSuccessorSize + (index_key_is_user_key ? 
0 : 8)); expected_tp->filter_size = kTableCount * (kKeysPerTable * kBloomBitsPerKey / 8); } @@ -315,14 +316,14 @@ TEST_F(DBPropertiesTest, AggregatedTableProperties) { } std::string property; db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property); + TableProperties output_tp; + ParseTablePropertiesString(property, &output_tp); + bool index_key_is_user_key = output_tp.index_key_is_user_key > 0; TableProperties expected_tp; GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize, kKeysPerTable, kTableCount, kBloomBitsPerKey, - table_options.block_size); - - TableProperties output_tp; - ParseTablePropertiesString(property, &output_tp); + table_options.block_size, index_key_is_user_key); VerifyTableProperties(expected_tp, output_tp); } @@ -489,6 +490,7 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) { } db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string); ParseTablePropertiesString(tp_string, &tp); + bool index_key_is_user_key = tp.index_key_is_user_key > 0; ASSERT_EQ(sum_tp.data_size, tp.data_size); ASSERT_EQ(sum_tp.index_size, tp.index_size); ASSERT_EQ(sum_tp.filter_size, tp.filter_size); @@ -497,9 +499,9 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) { ASSERT_EQ(sum_tp.num_data_blocks, tp.num_data_blocks); ASSERT_EQ(sum_tp.num_entries, tp.num_entries); if (table > 3) { - GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize, - kKeysPerTable, table, kBloomBitsPerKey, - table_options.block_size); + GetExpectedTableProperties( + &expected_tp, kKeySize, kValueSize, kKeysPerTable, table, + kBloomBitsPerKey, table_options.block_size, index_key_is_user_key); // Gives larger bias here as index block size, filter block size, // and data block size become much harder to estimate in this test. 
VerifyTableProperties(tp, expected_tp, 0.5, 0.4, 0.4, 0.25); diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index e74f4669f0..6ee1c6c388 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -214,8 +214,11 @@ struct BlockBasedTableOptions { // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you // don't plan to run RocksDB before version 3.10, you should probably use // this. - // This option only affects newly written tables. When reading existing tables, - // the information about version is read from the footer. + // 3 -- Can be read by RocksDB's versions since 5.15. Changes the way we + // encode the keys in index blocks. If you don't plan to run RocksDB before + // version 5.15, you should probably use this. + // This option only affects newly written tables. When reading existing + // tables, the information about version is read from the footer. uint32_t format_version = 2; // Store index blocks on disk in compressed format. Changing this option to diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h index 4a525591bc..18165922a4 100644 --- a/include/rocksdb/table_properties.h +++ b/include/rocksdb/table_properties.h @@ -33,6 +33,7 @@ struct TablePropertiesNames { static const std::string kIndexSize; static const std::string kIndexPartitions; static const std::string kTopLevelIndexSize; + static const std::string kIndexKeyIsUserKey; static const std::string kFilterSize; static const std::string kRawKeySize; static const std::string kRawValueSize; @@ -134,6 +135,9 @@ struct TableProperties { uint64_t index_partitions = 0; // Size of the top-level index if kTwoLevelIndexSearch is used uint64_t top_level_index_size = 0; + // Whether the index key is user key. Otherwise it includes 8 bytes of + // sequence number added by internal key format. + uint64_t index_key_is_user_key = 0; // the size of filter block. 
uint64_t filter_size = 0; // total raw key size diff --git a/table/block.cc b/table/block.cc index d13c6eb212..15c5cd6092 100644 --- a/table/block.cc +++ b/table/block.cc @@ -87,7 +87,11 @@ void BlockIter::Prev() { const Slice current_key(key_ptr, current_prev_entry.key_size); current_ = current_prev_entry.offset; - key_.SetInternalKey(current_key, false /* copy */); + if (key_includes_seq_) { + key_.SetInternalKey(current_key, false /* copy */); + } else { + key_.SetUserKey(current_key, false /* copy */); + } value_ = current_prev_entry.value; return; @@ -136,6 +140,10 @@ void BlockIter::Prev() { } void BlockIter::Seek(const Slice& target) { + Slice seek_key = target; + if (!key_includes_seq_) { + seek_key = ExtractUserKey(target); + } PERF_TIMER_GUARD(block_seek_nanos); if (data_ == nullptr) { // Not init yet return; @@ -145,7 +153,7 @@ void BlockIter::Seek(const Slice& target) { if (prefix_index_) { ok = PrefixSeek(target, &index); } else { - ok = BinarySeek(target, 0, num_restarts_ - 1, &index); + ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index); } if (!ok) { @@ -155,7 +163,7 @@ void BlockIter::Seek(const Slice& target) { // Linear search (within restart block) for first key >= target while (true) { - if (!ParseNextKey() || Compare(key_.GetInternalKey(), target) >= 0) { + if (!ParseNextKey() || Compare(key_, seek_key) >= 0) { return; } } @@ -163,24 +171,28 @@ void BlockIter::Seek(const Slice& target) { void BlockIter::SeekForPrev(const Slice& target) { PERF_TIMER_GUARD(block_seek_nanos); + Slice seek_key = target; + if (!key_includes_seq_) { + seek_key = ExtractUserKey(target); + } if (data_ == nullptr) { // Not init yet return; } uint32_t index = 0; - bool ok = BinarySeek(target, 0, num_restarts_ - 1, &index); + bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index); if (!ok) { return; } SeekToRestartPoint(index); - // Linear search (within restart block) for first key >= target + // Linear search (within restart block) for first key >= seek_key - 
while (ParseNextKey() && Compare(key_.GetInternalKey(), target) < 0) { + while (ParseNextKey() && Compare(key_, seek_key) < 0) { } if (!Valid()) { SeekToLast(); } else { - while (Valid() && Compare(key_.GetInternalKey(), target) > 0) { + while (Valid() && Compare(key_, seek_key) > 0) { Prev(); } } @@ -233,7 +245,11 @@ bool BlockIter::ParseNextKey() { if (shared == 0) { // If this key dont share any bytes with prev key then we dont need // to decode it and can use it's address in the block directly. - key_.SetInternalKey(Slice(p, non_shared), false /* copy */); + if (key_includes_seq_) { + key_.SetInternalKey(Slice(p, non_shared), false /* copy */); + } else { + key_.SetUserKey(Slice(p, non_shared), false /* copy */); + } key_pinned_ = true; } else { // This key share `shared` bytes with prev key, we need to decode it @@ -380,6 +396,10 @@ bool BlockIter::BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, bool BlockIter::PrefixSeek(const Slice& target, uint32_t* index) { assert(prefix_index_); + Slice seek_key = target; + if (!key_includes_seq_) { + seek_key = ExtractUserKey(target); + } uint32_t* block_ids = nullptr; uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids); @@ -387,7 +407,7 @@ bool BlockIter::PrefixSeek(const Slice& target, uint32_t* index) { current_ = restarts_; return false; } else { - return BinaryBlockIndexSeek(target, block_ids, 0, num_blocks - 1, index); + return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index); } } @@ -422,8 +442,9 @@ Block::Block(BlockContents&& contents, SequenceNumber _global_seqno, } } -BlockIter* Block::NewIterator(const Comparator* cmp, BlockIter* iter, - bool total_order_seek, Statistics* stats) { +BlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp, + BlockIter* iter, bool total_order_seek, + Statistics* stats, bool key_includes_seq) { BlockIter* ret_iter; if (iter != nullptr) { ret_iter = iter; @@ -441,9 +462,9 @@ BlockIter* Block::NewIterator(const 
Comparator* cmp, BlockIter* iter, } else { BlockPrefixIndex* prefix_index_ptr = total_order_seek ? nullptr : prefix_index_.get(); - ret_iter->Initialize(cmp, data_, restart_offset_, num_restarts_, + ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_, prefix_index_ptr, global_seqno_, - read_amp_bitmap_.get()); + read_amp_bitmap_.get(), key_includes_seq); if (read_amp_bitmap_) { if (read_amp_bitmap_->GetStatistics() != stats) { diff --git a/table/block.h b/table/block.h index 0c0371b655..e2f6e48d0b 100644 --- a/table/block.h +++ b/table/block.h @@ -162,6 +162,9 @@ class Block { // the iterator will simply be set as "invalid", rather than returning // the key that is just pass the target key. // + // If comparator is InternalKeyComparator, user_comparator is its user + // comparator; they are equal otherwise. + // // If iter is null, return new Iterator // If iter is not null, update this one and return it as Iterator* // @@ -169,9 +172,11 @@ class Block { // This option only applies for index block. For data block, hash_index_ // and prefix_index_ are null, so this option does not matter. BlockIter* NewIterator(const Comparator* comparator, + const Comparator* user_comparator, BlockIter* iter = nullptr, bool total_order_seek = true, - Statistics* stats = nullptr); + Statistics* stats = nullptr, + bool key_includes_seq = true); void SetBlockPrefixIndex(BlockPrefixIndex* prefix_index); // Report an approximation of how much memory has been used. @@ -203,6 +208,7 @@ class BlockIter final : public InternalIterator { // and status() is OK. 
BlockIter() : comparator_(nullptr), + user_comparator_(nullptr), data_(nullptr), restarts_(0), num_restarts_(0), @@ -211,26 +217,30 @@ class BlockIter final : public InternalIterator { status_(Status::OK()), prefix_index_(nullptr), key_pinned_(false), + key_includes_seq_(true), global_seqno_(kDisableGlobalSequenceNumber), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {} - BlockIter(const Comparator* comparator, const char* data, uint32_t restarts, - uint32_t num_restarts, BlockPrefixIndex* prefix_index, - SequenceNumber global_seqno, BlockReadAmpBitmap* read_amp_bitmap) + BlockIter(const Comparator* comparator, const Comparator* user_comparator, + const char* data, uint32_t restarts, uint32_t num_restarts, + BlockPrefixIndex* prefix_index, SequenceNumber global_seqno, + BlockReadAmpBitmap* read_amp_bitmap, bool key_includes_seq) : BlockIter() { - Initialize(comparator, data, restarts, num_restarts, prefix_index, - global_seqno, read_amp_bitmap); + Initialize(comparator, user_comparator, data, restarts, num_restarts, + prefix_index, global_seqno, read_amp_bitmap, key_includes_seq); } - void Initialize(const Comparator* comparator, const char* data, + void Initialize(const Comparator* comparator, + const Comparator* user_comparator, const char* data, uint32_t restarts, uint32_t num_restarts, BlockPrefixIndex* prefix_index, SequenceNumber global_seqno, - BlockReadAmpBitmap* read_amp_bitmap) { + BlockReadAmpBitmap* read_amp_bitmap, bool key_includes_seq) { assert(data_ == nullptr); // Ensure it is called only once assert(num_restarts > 0); // Ensure the param is valid comparator_ = comparator; + user_comparator_ = user_comparator; data_ = data; restarts_ = restarts; num_restarts_ = num_restarts; @@ -240,6 +250,7 @@ class BlockIter final : public InternalIterator { global_seqno_ = global_seqno; read_amp_bitmap_ = read_amp_bitmap; last_bitmap_offset_ = current_ + 1; + key_includes_seq_ = key_includes_seq; } // Makes Valid() return false, status() return `s`, and 
Seek()/Prev()/etc do @@ -263,7 +274,7 @@ class BlockIter final : public InternalIterator { virtual Status status() const override { return status_; } virtual Slice key() const override { assert(Valid()); - return key_.GetInternalKey(); + return key_includes_seq_ ? key_.GetInternalKey() : key_.GetUserKey(); } virtual Slice value() const override { assert(Valid()); @@ -312,7 +323,11 @@ class BlockIter final : public InternalIterator { } private: + // Note: The type could be changed to InternalKeyComparator but we see a weird + // performance drop by that. const Comparator* comparator_; + // Same as comparator_ if comparator_ is not InternalKeyComparator + const Comparator* user_comparator_; const char* data_; // underlying block contents uint32_t restarts_; // Offset of restart array (list of fixed32) uint32_t num_restarts_; // Number of uint32_t entries in restart array @@ -325,8 +340,11 @@ class BlockIter final : public InternalIterator { Status status_; BlockPrefixIndex* prefix_index_; bool key_pinned_; + // Key is in InternalKey format + bool key_includes_seq_; SequenceNumber global_seqno_; + public: // read-amp bitmap BlockReadAmpBitmap* read_amp_bitmap_; // last `current_` value we report to read-amp bitmp @@ -357,7 +375,19 @@ class BlockIter final : public InternalIterator { int32_t prev_entries_idx_ = -1; inline int Compare(const Slice& a, const Slice& b) const { - return comparator_->Compare(a, b); + if (key_includes_seq_) { + return comparator_->Compare(a, b); + } else { + return user_comparator_->Compare(a, b); + } + } + + inline int Compare(const IterKey& ikey, const Slice& b) const { + if (key_includes_seq_) { + return comparator_->Compare(ikey.GetInternalKey(), b); + } else { + return user_comparator_->Compare(ikey.GetUserKey(), b); + } } // Return the offset in data_ just past the end of the current entry. 
diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index 8c4ced1dd0..a742f63277 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -763,6 +763,8 @@ Status BlockBasedTableBuilder::Finish() { r->props.top_level_index_size = r->p_index_builder_->EstimateTopLevelIndexSize(r->offset); } + r->props.index_key_is_user_key = + !r->index_builder->seperator_is_key_plus_seq(); r->props.creation_time = r->creation_time; r->props.oldest_key_time = r->oldest_key_time; diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index e3783fe177..1fdcda6fbd 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -212,7 +212,7 @@ class PartitionIndexReader : public IndexReader, public Cleanable { const InternalKeyComparator* icomparator, IndexReader** index_reader, const PersistentCacheOptions& cache_options, - const int level) { + const int level, const bool index_key_includes_seq) { std::unique_ptr index_block; auto s = ReadBlockFromFile( file, prefetch_buffer, footer, ReadOptions(), index_handle, @@ -221,9 +221,9 @@ class PartitionIndexReader : public IndexReader, public Cleanable { kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */); if (s.ok()) { - *index_reader = - new PartitionIndexReader(table, icomparator, std::move(index_block), - ioptions.statistics, level); + *index_reader = new PartitionIndexReader( + table, icomparator, std::move(index_block), ioptions.statistics, + level, index_key_includes_seq); } return s; @@ -237,15 +237,19 @@ class PartitionIndexReader : public IndexReader, public Cleanable { if (!partition_map_.empty()) { return NewTwoLevelIterator( new BlockBasedTable::PartitionedIndexIteratorState( - table_, partition_map_.size() ? 
&partition_map_ : nullptr), - index_block_->NewIterator(icomparator_, nullptr, true)); + table_, &partition_map_, index_key_includes_seq_), + index_block_->NewIterator( + icomparator_, icomparator_->user_comparator(), nullptr, true)); } else { auto ro = ReadOptions(); ro.fill_cache = fill_cache; + bool kIsIndex = true; return new BlockBasedTableIterator( table_, ro, *icomparator_, - index_block_->NewIterator(icomparator_, nullptr, true), false, - /* prefix_extractor */ nullptr); + index_block_->NewIterator( + icomparator_, icomparator_->user_comparator(), nullptr, true), + false, + /* prefix_extractor */ nullptr, kIsIndex, index_key_includes_seq_); } // TODO(myabandeh): Update TwoLevelIterator to be able to make use of // on-stack BlockIter while the state is on heap. Currentlly it assumes @@ -258,7 +262,8 @@ class PartitionIndexReader : public IndexReader, public Cleanable { auto rep = table_->rep_; BlockIter biter; BlockHandle handle; - index_block_->NewIterator(icomparator_, &biter, true); + index_block_->NewIterator(icomparator_, icomparator_->user_comparator(), + &biter, true); // Index partitions are assumed to be consecuitive. Prefetch them all. // Read the first block offset biter.SeekToFirst(); @@ -347,16 +352,18 @@ class PartitionIndexReader : public IndexReader, public Cleanable { PartitionIndexReader(BlockBasedTable* table, const InternalKeyComparator* icomparator, std::unique_ptr&& index_block, Statistics* stats, - const int /*level*/) + const int /*level*/, const bool index_key_includes_seq) : IndexReader(icomparator, stats), table_(table), - index_block_(std::move(index_block)) { + index_block_(std::move(index_block)), + index_key_includes_seq_(index_key_includes_seq) { assert(index_block_ != nullptr); } BlockBasedTable* table_; std::unique_ptr index_block_; std::unordered_map> partition_map_; + const bool index_key_includes_seq_; }; // Index that allows binary search lookup for the first key of each block. 
@@ -374,7 +381,8 @@ class BinarySearchIndexReader : public IndexReader { const ImmutableCFOptions& ioptions, const InternalKeyComparator* icomparator, IndexReader** index_reader, - const PersistentCacheOptions& cache_options) { + const PersistentCacheOptions& cache_options, + const bool index_key_includes_seq) { std::unique_ptr index_block; auto s = ReadBlockFromFile( file, prefetch_buffer, footer, ReadOptions(), index_handle, @@ -384,7 +392,8 @@ class BinarySearchIndexReader : public IndexReader { if (s.ok()) { *index_reader = new BinarySearchIndexReader( - icomparator, std::move(index_block), ioptions.statistics); + icomparator, std::move(index_block), ioptions.statistics, + index_key_includes_seq); } return s; @@ -393,7 +402,9 @@ class BinarySearchIndexReader : public IndexReader { virtual InternalIterator* NewIterator(BlockIter* iter = nullptr, bool /*dont_care*/ = true, bool /*dont_care*/ = true) override { - return index_block_->NewIterator(icomparator_, iter, true); + return index_block_->NewIterator(icomparator_, + icomparator_->user_comparator(), iter, + true, nullptr, index_key_includes_seq_); } virtual size_t size() const override { return index_block_->size(); } @@ -409,11 +420,14 @@ class BinarySearchIndexReader : public IndexReader { private: BinarySearchIndexReader(const InternalKeyComparator* icomparator, std::unique_ptr&& index_block, - Statistics* stats) - : IndexReader(icomparator, stats), index_block_(std::move(index_block)) { + Statistics* stats, const bool index_key_includes_seq) + : IndexReader(icomparator, stats), + index_block_(std::move(index_block)), + index_key_includes_seq_(index_key_includes_seq) { assert(index_block_ != nullptr); } std::unique_ptr index_block_; + const bool index_key_includes_seq_; }; // Index that leverages an internal hash table to quicken the lookup for a given @@ -429,7 +443,8 @@ class HashIndexReader : public IndexReader { InternalIterator* meta_index_iter, IndexReader** index_reader, bool 
/*hash_index_allow_collision*/, - const PersistentCacheOptions& cache_options) { + const PersistentCacheOptions& cache_options, + const bool index_key_includes_seq) { std::unique_ptr index_block; auto s = ReadBlockFromFile( file, prefetch_buffer, footer, ReadOptions(), index_handle, @@ -447,7 +462,7 @@ class HashIndexReader : public IndexReader { auto new_index_reader = new HashIndexReader(icomparator, std::move(index_block), - ioptions.statistics); + ioptions.statistics, index_key_includes_seq); *index_reader = new_index_reader; // Get prefixes block @@ -504,7 +519,9 @@ class HashIndexReader : public IndexReader { virtual InternalIterator* NewIterator(BlockIter* iter = nullptr, bool total_order_seek = true, bool /*dont_care*/ = true) override { - return index_block_->NewIterator(icomparator_, iter, total_order_seek); + return index_block_->NewIterator( + icomparator_, icomparator_->user_comparator(), iter, total_order_seek, + nullptr, index_key_includes_seq_); } virtual size_t size() const override { return index_block_->size(); } @@ -520,8 +537,11 @@ class HashIndexReader : public IndexReader { private: HashIndexReader(const InternalKeyComparator* icomparator, - std::unique_ptr&& index_block, Statistics* stats) - : IndexReader(icomparator, stats), index_block_(std::move(index_block)) { + std::unique_ptr&& index_block, Statistics* stats, + const bool index_key_includes_seq) + : IndexReader(icomparator, stats), + index_block_(std::move(index_block)), + index_key_includes_seq_(index_key_includes_seq) { assert(index_block_ != nullptr); } @@ -530,6 +550,7 @@ class HashIndexReader : public IndexReader { std::unique_ptr index_block_; BlockContents prefixes_contents_; + const bool index_key_includes_seq_; }; // Helper function to setup the cache key's prefix for the Table. @@ -1026,7 +1047,8 @@ Status BlockBasedTable::ReadMetaBlock(Rep* rep, *meta_block = std::move(meta); // meta block uses bytewise comparator. 
- iter->reset(meta_block->get()->NewIterator(BytewiseComparator())); + iter->reset(meta_block->get()->NewIterator(BytewiseComparator(), + BytewiseComparator())); return Status::OK(); } @@ -1502,14 +1524,15 @@ InternalIterator* BlockBasedTable::NewIndexIterator( BlockIter* BlockBasedTable::NewDataBlockIterator( Rep* rep, const ReadOptions& ro, const Slice& index_value, - BlockIter* input_iter, bool is_index, GetContext* get_context) { + BlockIter* input_iter, bool is_index, bool key_includes_seq, + GetContext* get_context) { BlockHandle handle; Slice input = index_value; // We intentionally allow extra stuff in index_value so that we // can add more features in the future. Status s = handle.DecodeFrom(&input); return NewDataBlockIterator(rep, ro, handle, input_iter, is_index, - get_context, s); + key_includes_seq, get_context, s); } // Convert an index iterator value (i.e., an encoded BlockHandle) @@ -1518,7 +1541,8 @@ BlockIter* BlockBasedTable::NewDataBlockIterator( // If input_iter is not null, update this iter and return it BlockIter* BlockBasedTable::NewDataBlockIterator( Rep* rep, const ReadOptions& ro, const BlockHandle& handle, - BlockIter* input_iter, bool is_index, GetContext* get_context, Status s) { + BlockIter* input_iter, bool is_index, bool key_includes_seq, + GetContext* get_context, Status s) { PERF_TIMER_GUARD(new_table_block_iter_nanos); const bool no_io = (ro.read_tier == kBlockCacheTier); @@ -1564,8 +1588,9 @@ BlockIter* BlockBasedTable::NewDataBlockIterator( if (s.ok()) { assert(block.value != nullptr); - iter = block.value->NewIterator(&rep->internal_comparator, iter, true, - rep->ioptions.statistics); + iter = block.value->NewIterator( + &rep->internal_comparator, rep->internal_comparator.user_comparator(), + iter, true, rep->ioptions.statistics, key_includes_seq); if (block.cache_handle != nullptr) { iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, block.cache_handle); @@ -1677,8 +1702,11 @@ Status 
BlockBasedTable::MaybeLoadDataBlockToCache( BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( BlockBasedTable* table, - std::unordered_map>* block_map) - : table_(table), block_map_(block_map) {} + std::unordered_map>* block_map, + bool index_key_includes_seq) + : table_(table), + block_map_(block_map), + index_key_includes_seq_(index_key_includes_seq) {} const size_t BlockBasedTableIterator::kMaxReadaheadSize = 256 * 1024; @@ -1701,8 +1729,9 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( assert(block_cache); RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_READ, block_cache->GetUsage(block->second.cache_handle)); - return block->second.value->NewIterator(&rep->internal_comparator, nullptr, - true, rep->ioptions.statistics); + return block->second.value->NewIterator( + &rep->internal_comparator, rep->internal_comparator.user_comparator(), + nullptr, true, rep->ioptions.statistics, index_key_includes_seq_); } // Create an empty iterator return new BlockIter(); @@ -1770,7 +1799,9 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key, // and we're not really sure that we're past the end // of the file may_match = iiter->status().IsIncomplete(); - } else if (ExtractUserKey(iiter->key()) + } else if ((rep_->table_properties->index_key_is_user_key + ? 
iiter->key() + : ExtractUserKey(iiter->key())) .starts_with(ExtractUserKey(internal_prefix))) { // we need to check for this subtle case because our only // guarantee is that "the key is a string >= last key in that data @@ -1836,7 +1867,11 @@ void BlockBasedTableIterator::Seek(const Slice& target) { FindKeyForward(); assert(!data_block_iter_.Valid() || - icomp_.Compare(target, data_block_iter_.key()) <= 0); + (key_includes_seq_ && + icomp_.Compare(target, data_block_iter_.key()) <= 0) || + (!key_includes_seq_ && + icomp_.user_comparator()->Compare(ExtractUserKey(target), + data_block_iter_.key()) <= 0)); } void BlockBasedTableIterator::SeekForPrev(const Slice& target) { @@ -1952,7 +1987,8 @@ void BlockBasedTableIterator::InitDataBlock() { } BlockBasedTable::NewDataBlockIterator(rep, read_options_, data_block_handle, - &data_block_iter_, false, + &data_block_iter_, is_index_, + key_includes_seq_, /* get_context */ nullptr, s); block_iter_points_to_real_block_ = true; } @@ -2024,24 +2060,25 @@ InternalIterator* BlockBasedTable::NewIterator( Arena* arena, bool skip_filters) { bool prefix_extractor_changed = PrefixExtractorChanged(rep_->table_properties, prefix_extractor); + const bool kIsNotIndex = false; if (arena == nullptr) { return new BlockBasedTableIterator( this, read_options, rep_->internal_comparator, NewIndexIterator( read_options, prefix_extractor_changed && - rep_->index_type == BlockBasedTableOptions::kHashSearch), + rep_->index_type == BlockBasedTableOptions::kHashSearch), !skip_filters && !read_options.total_order_seek && - prefix_extractor != nullptr && !prefix_extractor_changed, - prefix_extractor); + prefix_extractor != nullptr && !prefix_extractor_changed, + prefix_extractor, kIsNotIndex); } else { auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator)); return new (mem) BlockBasedTableIterator( this, read_options, rep_->internal_comparator, NewIndexIterator(read_options, prefix_extractor_changed), !skip_filters && 
!read_options.total_order_seek && - prefix_extractor != nullptr && !prefix_extractor_changed, - prefix_extractor); + prefix_extractor != nullptr && !prefix_extractor_changed, + prefix_extractor, kIsNotIndex); } } @@ -2061,7 +2098,8 @@ InternalIterator* BlockBasedTable::NewRangeTombstoneIterator( assert(block_cache != nullptr); if (block_cache->Ref(rep_->range_del_entry.cache_handle)) { auto iter = rep_->range_del_entry.value->NewIterator( - &rep_->internal_comparator, nullptr /* iter */, + &rep_->internal_comparator, + rep_->internal_comparator.user_comparator(), nullptr /* iter */, true /* total_order_seek */, rep_->ioptions.statistics); iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, rep_->range_del_entry.cache_handle); @@ -2107,6 +2145,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, GetContext* get_context, const SliceTransform* prefix_extractor, bool skip_filters) { + assert(key.size() >= 8); // key must be internal key Status s; const bool no_io = read_options.read_tier == kBlockCacheTier; CachableEntry filter_entry; @@ -2215,6 +2254,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, Status BlockBasedTable::Prefetch(const Slice* const begin, const Slice* const end) { auto& comparator = rep_->internal_comparator; + auto user_comparator = comparator.user_comparator(); // pre-condition if (begin && end && comparator.Compare(*begin, *end) > 0) { return Status::InvalidArgument(*begin, *end); @@ -2238,8 +2278,11 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, for (begin ? 
iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid(); iiter->Next()) { Slice block_handle = iiter->value(); - - if (end && comparator.Compare(iiter->key(), *end) >= 0) { + const bool is_user_key = rep_->table_properties->index_key_is_user_key > 0; + if (end && + ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) || + (is_user_key && + user_comparator->Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) { if (prefetching_boundary_page) { break; } @@ -2392,12 +2435,14 @@ Status BlockBasedTable::CreateIndexReader( return PartitionIndexReader::Create( this, file, prefetch_buffer, footer, footer.index_handle(), rep_->ioptions, icomparator, index_reader, - rep_->persistent_cache_options, level); + rep_->persistent_cache_options, level, + rep_->table_properties->index_key_is_user_key == 0); } case BlockBasedTableOptions::kBinarySearch: { return BinarySearchIndexReader::Create( file, prefetch_buffer, footer, footer.index_handle(), rep_->ioptions, - icomparator, index_reader, rep_->persistent_cache_options); + icomparator, index_reader, rep_->persistent_cache_options, + rep_->table_properties->index_key_is_user_key == 0); } case BlockBasedTableOptions::kHashSearch: { std::unique_ptr meta_guard; @@ -2415,7 +2460,8 @@ Status BlockBasedTable::CreateIndexReader( return BinarySearchIndexReader::Create( file, prefetch_buffer, footer, footer.index_handle(), rep_->ioptions, icomparator, index_reader, - rep_->persistent_cache_options); + rep_->persistent_cache_options, + rep_->table_properties->index_key_is_user_key == 0); } meta_index_iter = meta_iter_guard.get(); } @@ -2424,7 +2470,8 @@ Status BlockBasedTable::CreateIndexReader( rep_->internal_prefix_transform.get(), footer, file, prefetch_buffer, rep_->ioptions, icomparator, footer.index_handle(), meta_index_iter, index_reader, rep_->hash_index_allow_collision, - rep_->persistent_cache_options); + rep_->persistent_cache_options, + rep_->table_properties->index_key_is_user_key == 0); } default: { std::string 
error_message = @@ -2709,16 +2756,22 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { break; } Slice key = blockhandles_iter->key(); + Slice user_key; InternalKey ikey; - ikey.DecodeFrom(key); + if (rep_->table_properties->index_key_is_user_key == 0) { + ikey.DecodeFrom(key); + user_key = ikey.user_key(); + } else { + user_key = key; + } out_file->Append(" HEX "); - out_file->Append(ikey.user_key().ToString(true).c_str()); + out_file->Append(user_key.ToString(true).c_str()); out_file->Append(": "); out_file->Append(blockhandles_iter->value().ToString(true).c_str()); out_file->Append("\n"); - std::string str_key = ikey.user_key().ToString(); + std::string str_key = user_key.ToString(); std::string res_key(""); char cspace = ' '; for (size_t i = 0; i < str_key.size(); i++) { diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 4baed2a60b..6c39e567cf 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -217,11 +217,13 @@ class BlockBasedTable : public TableReader { const Slice& index_value, BlockIter* input_iter = nullptr, bool is_index = false, + bool key_includes_seq = true, GetContext* get_context = nullptr); static BlockIter* NewDataBlockIterator(Rep* rep, const ReadOptions& ro, const BlockHandle& block_hanlde, BlockIter* input_iter = nullptr, bool is_index = false, + bool key_includes_seq = true, GetContext* get_context = nullptr, Status s = Status()); @@ -378,13 +380,15 @@ class BlockBasedTable::PartitionedIndexIteratorState public: PartitionedIndexIteratorState( BlockBasedTable* table, - std::unordered_map>* block_map = nullptr); + std::unordered_map>* block_map, + const bool index_key_includes_seq); InternalIterator* NewSecondaryIterator(const Slice& index_value) override; private: // Don't own table_ BlockBasedTable* table_; std::unordered_map>* block_map_; + bool index_key_includes_seq_; }; // CachableEntry represents the entries that *may* be fetched from block cache. 
@@ -509,7 +513,8 @@ class BlockBasedTableIterator : public InternalIterator { const ReadOptions& read_options, const InternalKeyComparator& icomp, InternalIterator* index_iter, bool check_filter, - const SliceTransform* prefix_extractor) + const SliceTransform* prefix_extractor, bool is_index, + bool key_includes_seq = true) : table_(table), read_options_(read_options), icomp_(icomp), @@ -517,6 +522,8 @@ class BlockBasedTableIterator : public InternalIterator { pinned_iters_mgr_(nullptr), block_iter_points_to_real_block_(false), check_filter_(check_filter), + is_index_(is_index), + key_includes_seq_(key_includes_seq), prefix_extractor_(prefix_extractor) {} ~BlockBasedTableIterator() { delete index_iter_; } @@ -609,6 +616,10 @@ class BlockBasedTableIterator : public InternalIterator { bool block_iter_points_to_real_block_; bool is_out_of_bound_ = false; bool check_filter_; + // If the blocks over which we iterate are index blocks + bool is_index_; + // If the keys in the blocks over which we iterate include 8 byte sequence + bool key_includes_seq_; // TODO use block offset instead std::string prev_index_value_; const SliceTransform* prefix_extractor_; diff --git a/table/block_test.cc b/table/block_test.cc index 968951e6c3..be247ee209 100644 --- a/table/block_test.cc +++ b/table/block_test.cc @@ -99,7 +99,8 @@ TEST_F(BlockTest, SimpleTest) { // read contents of block sequentially int count = 0; - InternalIterator *iter = reader.NewIterator(options.comparator); + InternalIterator *iter = + reader.NewIterator(options.comparator, options.comparator); for (iter->SeekToFirst();iter->Valid(); count++, iter->Next()) { // read kv from block @@ -113,7 +114,7 @@ TEST_F(BlockTest, SimpleTest) { delete iter; // read block contents randomly - iter = reader.NewIterator(options.comparator); + iter = reader.NewIterator(options.comparator, options.comparator); for (int i = 0; i < num_records; i++) { // find a random key in the lookaside array @@ -163,7 +164,7 @@ void 
CheckBlockContents(BlockContents contents, const int max_key, NewFixedPrefixTransform(prefix_size)); std::unique_ptr regular_iter( - reader2.NewIterator(BytewiseComparator())); + reader2.NewIterator(BytewiseComparator(), BytewiseComparator())); // Seek existent keys for (size_t i = 0; i < keys.size(); i++) { @@ -388,8 +389,8 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { // read contents of block sequentially size_t read_bytes = 0; - BlockIter *iter = static_cast( - reader.NewIterator(options.comparator, nullptr, true, stats.get())); + BlockIter *iter = static_cast(reader.NewIterator( + options.comparator, options.comparator, nullptr, true, stats.get())); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { iter->value(); read_bytes += iter->TEST_CurrentEntrySize(); @@ -421,8 +422,8 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { kBytesPerBit, stats.get()); size_t read_bytes = 0; - BlockIter *iter = static_cast( - reader.NewIterator(options.comparator, nullptr, true, stats.get())); + BlockIter *iter = static_cast(reader.NewIterator( + options.comparator, options.comparator, nullptr, true, stats.get())); for (int i = 0; i < num_records; i++) { Slice k(keys[i]); @@ -457,8 +458,8 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { kBytesPerBit, stats.get()); size_t read_bytes = 0; - BlockIter *iter = static_cast( - reader.NewIterator(options.comparator, nullptr, true, stats.get())); + BlockIter *iter = static_cast(reader.NewIterator( + options.comparator, options.comparator, nullptr, true, stats.get())); std::unordered_set read_keys; for (int i = 0; i < num_records; i++) { int index = rnd.Uniform(num_records); diff --git a/table/index_builder.cc b/table/index_builder.cc index 08ce56e120..8a7fb8982d 100644 --- a/table/index_builder.cc +++ b/table/index_builder.cc @@ -31,13 +31,15 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder( IndexBuilder* result = nullptr; switch (index_type) { case BlockBasedTableOptions::kBinarySearch: { - result = new 
ShortenedIndexBuilder(comparator, - table_opt.index_block_restart_interval); + result = new ShortenedIndexBuilder(comparator, + table_opt.index_block_restart_interval, + table_opt.format_version); } break; case BlockBasedTableOptions::kHashSearch: { result = new HashIndexBuilder(comparator, int_key_slice_transform, - table_opt.index_block_restart_interval); + table_opt.index_block_restart_interval, + table_opt.format_version); } break; case BlockBasedTableOptions::kTwoLevelIndexSearch: { @@ -62,9 +64,11 @@ PartitionedIndexBuilder::PartitionedIndexBuilder( const InternalKeyComparator* comparator, const BlockBasedTableOptions& table_opt) : IndexBuilder(comparator), - index_block_builder_(table_opt.index_block_restart_interval), + index_block_builder_(table_opt.index_block_restart_interval, + table_opt.format_version), sub_index_builder_(nullptr), - table_opt_(table_opt) {} + table_opt_(table_opt), + seperator_is_key_plus_seq_(false) {} PartitionedIndexBuilder::~PartitionedIndexBuilder() { delete sub_index_builder_; @@ -73,7 +77,8 @@ PartitionedIndexBuilder::~PartitionedIndexBuilder() { void PartitionedIndexBuilder::MakeNewSubIndexBuilder() { assert(sub_index_builder_ == nullptr); sub_index_builder_ = new ShortenedIndexBuilder( - comparator_, table_opt_.index_block_restart_interval); + comparator_, table_opt_.index_block_restart_interval, + table_opt_.format_version); flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( table_opt_.metadata_block_size, table_opt_.block_size_deviation, sub_index_builder_->index_block_builder_)); @@ -95,6 +100,10 @@ void PartitionedIndexBuilder::AddIndexEntry( } sub_index_builder_->AddIndexEntry(last_key_in_current_block, first_key_in_next_block, block_handle); + if (sub_index_builder_->seperator_is_key_plus_seq_) { + // then we need to apply it to all sub-index builders + seperator_is_key_plus_seq_ = true; + } sub_index_last_key_ = std::string(*last_key_in_current_block); entries_.push_back( {sub_index_last_key_, @@ 
-123,6 +132,10 @@ void PartitionedIndexBuilder::AddIndexEntry( sub_index_builder_->AddIndexEntry(last_key_in_current_block, first_key_in_next_block, block_handle); sub_index_last_key_ = std::string(*last_key_in_current_block); + if (sub_index_builder_->seperator_is_key_plus_seq_) { + // then we need to apply it to all sub-index builders + seperator_is_key_plus_seq_ = true; + } } } @@ -146,6 +159,8 @@ Status PartitionedIndexBuilder::Finish( // Finish the next partition index in line and Incomplete() to indicate we // expect more calls to Finish Entry& entry = entries_.front(); + // Apply the policy to all sub-indexes + entry.value->seperator_is_key_plus_seq_ = seperator_is_key_plus_seq_; auto s = entry.value->Finish(index_blocks); finishing_indexes = true; return s.ok() ? Status::Incomplete() : s; diff --git a/table/index_builder.h b/table/index_builder.h index 3793cebc25..cde7496d4b 100644 --- a/table/index_builder.h +++ b/table/index_builder.h @@ -99,6 +99,8 @@ class IndexBuilder { // Get the estimated size for index block. 
virtual size_t EstimatedSize() const = 0; + virtual bool seperator_is_key_plus_seq() { return true; } + protected: const InternalKeyComparator* comparator_; }; @@ -115,9 +117,14 @@ class IndexBuilder { class ShortenedIndexBuilder : public IndexBuilder { public: explicit ShortenedIndexBuilder(const InternalKeyComparator* comparator, - int index_block_restart_interval) + int index_block_restart_interval, + uint32_t format_version) : IndexBuilder(comparator), - index_block_builder_(index_block_restart_interval) {} + index_block_builder_(index_block_restart_interval), + index_block_builder_without_seq_(index_block_restart_interval) { + // Making the default true will disable the feature for old versions + seperator_is_key_plus_seq_ = (format_version <= 2); + } virtual void AddIndexEntry(std::string* last_key_in_current_block, const Slice* first_key_in_next_block, @@ -125,31 +132,57 @@ class ShortenedIndexBuilder : public IndexBuilder { if (first_key_in_next_block != nullptr) { comparator_->FindShortestSeparator(last_key_in_current_block, *first_key_in_next_block); + if (!seperator_is_key_plus_seq_ && + comparator_->user_comparator()->Compare( + ExtractUserKey(*last_key_in_current_block), + ExtractUserKey(*first_key_in_next_block)) == 0) { + seperator_is_key_plus_seq_ = true; + } } else { comparator_->FindShortSuccessor(last_key_in_current_block); } + auto sep = Slice(*last_key_in_current_block); std::string handle_encoding; block_handle.EncodeTo(&handle_encoding); - index_block_builder_.Add(*last_key_in_current_block, handle_encoding); + index_block_builder_.Add(sep, handle_encoding); + if (!seperator_is_key_plus_seq_) { + index_block_builder_without_seq_.Add(ExtractUserKey(sep), + handle_encoding); + } } using IndexBuilder::Finish; virtual Status Finish( IndexBlocks* index_blocks, const BlockHandle& /*last_partition_block_handle*/) override { - index_blocks->index_block_contents = index_block_builder_.Finish(); + if (seperator_is_key_plus_seq_) { + 
index_blocks->index_block_contents = index_block_builder_.Finish(); + } else { + index_blocks->index_block_contents = + index_block_builder_without_seq_.Finish(); + } return Status::OK(); } virtual size_t EstimatedSize() const override { - return index_block_builder_.CurrentSizeEstimate(); + if (seperator_is_key_plus_seq_) { + return index_block_builder_.CurrentSizeEstimate(); + } else { + return index_block_builder_without_seq_.CurrentSizeEstimate(); + } + } + + virtual bool seperator_is_key_plus_seq() override { + return seperator_is_key_plus_seq_; } friend class PartitionedIndexBuilder; private: BlockBuilder index_block_builder_; + BlockBuilder index_block_builder_without_seq_; + bool seperator_is_key_plus_seq_; }; // HashIndexBuilder contains a binary-searchable primary index and the @@ -183,9 +216,11 @@ class HashIndexBuilder : public IndexBuilder { public: explicit HashIndexBuilder(const InternalKeyComparator* comparator, const SliceTransform* hash_key_extractor, - int index_block_restart_interval) + int index_block_restart_interval, + int format_version) : IndexBuilder(comparator), - primary_index_builder_(comparator, index_block_restart_interval), + primary_index_builder_(comparator, index_block_restart_interval, + format_version), hash_key_extractor_(hash_key_extractor) {} virtual void AddIndexEntry(std::string* last_key_in_current_block, @@ -240,6 +275,10 @@ class HashIndexBuilder : public IndexBuilder { prefix_meta_block_.size(); } + virtual bool seperator_is_key_plus_seq() override { + return primary_index_builder_.seperator_is_key_plus_seq(); + } + private: void FlushPendingPrefix() { prefix_block_.append(pending_entry_prefix_.data(), @@ -316,6 +355,10 @@ class PartitionedIndexBuilder : public IndexBuilder { // cutting the next partition void RequestPartitionCut(); + virtual bool seperator_is_key_plus_seq() override { + return seperator_is_key_plus_seq_; + } + private: void MakeNewSubIndexBuilder(); @@ -333,6 +376,7 @@ class PartitionedIndexBuilder : 
public IndexBuilder { // true if Finish is called once but not complete yet. bool finishing_indexes = false; const BlockBasedTableOptions& table_opt_; + bool seperator_is_key_plus_seq_; // true if an external entity (such as filter partition builder) request // cutting the next partition bool partition_cut_requested_ = true; diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 63fceacff7..8210e6aabf 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -71,6 +71,7 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { Add(TablePropertiesNames::kIndexPartitions, props.index_partitions); Add(TablePropertiesNames::kTopLevelIndexSize, props.top_level_index_size); } + Add(TablePropertiesNames::kIndexKeyIsUserKey, props.index_key_is_user_key); Add(TablePropertiesNames::kNumEntries, props.num_entries); Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks); Add(TablePropertiesNames::kFilterSize, props.filter_size); @@ -192,7 +193,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, Block properties_block(std::move(block_contents), kDisableGlobalSequenceNumber); BlockIter iter; - properties_block.NewIterator(BytewiseComparator(), &iter); + properties_block.NewIterator(BytewiseComparator(), BytewiseComparator(), + &iter); auto new_table_properties = new TableProperties(); // All pre-defined properties of type uint64_t @@ -203,6 +205,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, &new_table_properties->index_partitions}, {TablePropertiesNames::kTopLevelIndexSize, &new_table_properties->top_level_index_size}, + {TablePropertiesNames::kIndexKeyIsUserKey, + &new_table_properties->index_key_is_user_key}, {TablePropertiesNames::kFilterSize, &new_table_properties->filter_size}, {TablePropertiesNames::kRawKeySize, &new_table_properties->raw_key_size}, {TablePropertiesNames::kRawValueSize, @@ -312,7 +316,7 @@ Status ReadTableProperties(RandomAccessFileReader* 
file, uint64_t file_size, Block metaindex_block(std::move(metaindex_contents), kDisableGlobalSequenceNumber); std::unique_ptr meta_iter( - metaindex_block.NewIterator(BytewiseComparator())); + metaindex_block.NewIterator(BytewiseComparator(), BytewiseComparator())); // -- Read property block bool found_properties_block = true; @@ -375,7 +379,8 @@ Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, kDisableGlobalSequenceNumber); std::unique_ptr meta_iter; - meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator())); + meta_iter.reset( + metaindex_block.NewIterator(BytewiseComparator(), BytewiseComparator())); return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle); } @@ -416,7 +421,8 @@ Status ReadMetaBlock(RandomAccessFileReader* file, kDisableGlobalSequenceNumber); std::unique_ptr meta_iter; - meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator())); + meta_iter.reset( + metaindex_block.NewIterator(BytewiseComparator(), BytewiseComparator())); BlockHandle block_handle; status = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle); diff --git a/table/partitioned_filter_block.cc b/table/partitioned_filter_block.cc index 2f03f2fec9..2e03dc9799 100644 --- a/table/partitioned_filter_block.cc +++ b/table/partitioned_filter_block.cc @@ -113,7 +113,7 @@ PartitionedFilterBlockReader::~PartitionedFilterBlockReader() { char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; BlockIter biter; BlockHandle handle; - idx_on_fltr_blk_->NewIterator(&comparator_, &biter, true); + idx_on_fltr_blk_->NewIterator(&comparator_, &comparator_, &biter, true); biter.SeekToFirst(); for (; biter.Valid(); biter.Next()) { auto input = biter.value(); @@ -207,7 +207,7 @@ bool PartitionedFilterBlockReader::PrefixMayMatch( Slice PartitionedFilterBlockReader::GetFilterPartitionHandle( const Slice& entry) { BlockIter iter; - idx_on_fltr_blk_->NewIterator(&comparator_, &iter, true); + 
idx_on_fltr_blk_->NewIterator(&comparator_, &comparator_, &iter, true); iter.Seek(entry); if (UNLIKELY(!iter.Valid())) { return Slice(); @@ -269,7 +269,7 @@ void PartitionedFilterBlockReader::CacheDependencies( auto rep = table_->rep_; BlockIter biter; BlockHandle handle; - idx_on_fltr_blk_->NewIterator(&comparator_, &biter, true); + idx_on_fltr_blk_->NewIterator(&comparator_, &comparator_, &biter, true); // Index partitions are assumed to be consecuitive. Prefetch them all. // Read the first block offset biter.SeekToFirst(); diff --git a/table/table_properties.cc b/table/table_properties.cc index 306f5c17e4..4d75abdb31 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -90,7 +90,12 @@ std::string TableProperties::ToString( prop_delim, kv_delim); AppendProperty(result, "data block size", data_size, prop_delim, kv_delim); - AppendProperty(result, "index block size", index_size, prop_delim, kv_delim); + char index_block_size_str[80]; + snprintf(index_block_size_str, sizeof(index_block_size_str), + "index block size (user-key? 
%d)", + static_cast(index_key_is_user_key)); + AppendProperty(result, index_block_size_str, index_size, prop_delim, + kv_delim); if (index_partitions != 0) { AppendProperty(result, "# index partitions", index_partitions, prop_delim, kv_delim); @@ -155,6 +160,7 @@ void TableProperties::Add(const TableProperties& tp) { index_size += tp.index_size; index_partitions += tp.index_partitions; top_level_index_size += tp.top_level_index_size; + index_key_is_user_key += tp.index_key_is_user_key; filter_size += tp.filter_size; raw_key_size += tp.raw_key_size; raw_value_size += tp.raw_value_size; @@ -170,6 +176,8 @@ const std::string TablePropertiesNames::kIndexPartitions = "rocksdb.index.partitions"; const std::string TablePropertiesNames::kTopLevelIndexSize = "rocksdb.top-level.index.size"; +const std::string TablePropertiesNames::kIndexKeyIsUserKey = + "rocksdb.index.key.is.user.key"; const std::string TablePropertiesNames::kFilterSize = "rocksdb.filter.size"; const std::string TablePropertiesNames::kRawKeySize = diff --git a/table/table_test.cc b/table/table_test.cc index 79a43a2c99..ab8ab016fc 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -237,7 +237,7 @@ class BlockConstructor: public Constructor { } virtual InternalIterator* NewIterator( const SliceTransform* /*prefix_extractor*/) const override { - return block_->NewIterator(comparator_); + return block_->NewIterator(comparator_, comparator_); } private: @@ -2115,7 +2115,7 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) { GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, nullptr, nullptr, nullptr); - ASSERT_OK(reader->Get(ReadOptions(), user_key, &get_context, + ASSERT_OK(reader->Get(ReadOptions(), internal_key.Encode(), &get_context, moptions4.prefix_extractor.get())); ASSERT_STREQ(value.data(), "hello"); BlockCachePropertiesSnapshot props(options.statistics.get()); @@ -2427,7 +2427,8 @@ TEST_F(BlockBasedTableTest, 
BlockCacheLeak) { ASSERT_OK(c.Reopen(ioptions1, moptions1)); auto table_reader = dynamic_cast(c.GetTableReader()); for (const std::string& key : keys) { - ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key)); + InternalKey ikey(key, kMaxSequenceNumber, kTypeValue); + ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode())); } c.ResetTableReader(); @@ -2439,7 +2440,8 @@ TEST_F(BlockBasedTableTest, BlockCacheLeak) { ASSERT_OK(c.Reopen(ioptions2, moptions2)); table_reader = dynamic_cast(c.GetTableReader()); for (const std::string& key : keys) { - ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), key)); + InternalKey ikey(key, kMaxSequenceNumber, kTypeValue); + ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode())); } c.ResetTableReader(); } diff --git a/util/testutil.h b/util/testutil.h index d6b0a7095e..29f403ed83 100644 --- a/util/testutil.h +++ b/util/testutil.h @@ -87,13 +87,6 @@ class PlainInternalKeyComparator : public InternalKeyComparator { virtual int Compare(const Slice& a, const Slice& b) const override { return user_comparator()->Compare(a, b); } - virtual void FindShortestSeparator(std::string* start, - const Slice& limit) const override { - user_comparator()->FindShortestSeparator(start, limit); - } - virtual void FindShortSuccessor(std::string* key) const override { - user_comparator()->FindShortSuccessor(key); - } }; #endif