From a1a102ffcefab83e75b9d026d204395d00f2a582 Mon Sep 17 00:00:00 2001
From: Peter Dillinger
Date: Fri, 20 Sep 2024 15:54:19 -0700
Subject: [PATCH] Steps toward deprecating implicit prefix seek, related fixes (#13026)

Summary:
With some new use cases onboarding to prefix extractors/seek/filters, one of the risks is existing iterator code, e.g. for maintenance tasks, unintentionally being subject to prefix seek semantics. This is a longstanding known design flaw with prefix seek, and `prefix_same_as_start` and `auto_prefix_mode` were steps toward making it obsolete. However, we can't simply make `total_order_seek` default to true, because that would impact too much code at once.

Here we add a new DB option, `prefix_seek_opt_in_only`, that allows users to transition to the future behavior when they are ready. When set to true, all iterators are treated as if `total_order_seek=true`, and the only ways to get prefix seek semantics are with `prefix_same_as_start` or `auto_prefix_mode`.

Related fixes / changes:
* Make sure that `prefix_same_as_start` and `auto_prefix_mode` are compatible with (or override) `total_order_seek`, depending on your interpretation.
* Fix a bug in which a new iterator created after dynamically changing the prefix extractor might mix different prefix semantics between memtable and SSTs. Both should use the latest extractor semantics, which means iterators ignore memtable prefix filters built with an old extractor. That in turn means passing the latest prefix extractor to new memtable iterators that might use prefix seek. (Without the fix, the test added for this fails in many ways.)

Suggested follow-up:
* Investigate a FIXME where a MergeIteratorBuilder is created in db_impl.cc. No unit test detects a change in value that should impact correctness.
* Make memtable prefix bloom compatible with `auto_prefix_mode`, which might require involving the memtablereps because we don't know at iterator creation time (only at seek time) whether an auto_prefix_mode seek will be a prefix seek.
* Add `prefix_same_as_start` testing to db_stress

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13026

Test Plan: Tests updated and added. Added the combination of `total_order_seek=true` and `auto_prefix_mode=true` to the stress test. Ran `make blackbox_crash_test` for a long while. Manually ran tests with `prefix_seek_opt_in_only=true` as the default, looking for unexpected issues. I inspected most of the results and migrated many tests to be ready for such a change (but not all).
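Usage sketch (illustrative only, not part of the change): with the new option enabled, prefix seek must be requested per iterator. The DB path, key names, and the 4-byte extractor below are arbitrary examples.

```cpp
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4));
  // Opt in to the future behavior: no implicit prefix seek.
  options.prefix_seek_opt_in_only = true;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/prefix_opt_in_demo", &db);
  if (!s.ok()) {
    return 1;
  }

  // A plain iterator now behaves as if total_order_seek=true: Seek() is not
  // silently bounded by the prefix of the seek key.
  {
    std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(rocksdb::ReadOptions()));
    for (it->Seek("abcd"); it->Valid(); it->Next()) {
      if (!it->key().starts_with("abcd")) {
        break;  // caller enforces its own bounds
      }
    }
  }

  // Prefix seek semantics (and prefix filter optimizations) must now be
  // requested explicitly, e.g. with prefix_same_as_start or auto_prefix_mode.
  {
    rocksdb::ReadOptions ro;
    ro.prefix_same_as_start = true;
    std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
    for (it->Seek("abcd"); it->Valid(); it->Next()) {
      // Limited to keys sharing the 4-byte prefix "abcd".
    }
  }

  delete db;
  return 0;
}
```

Without `prefix_seek_opt_in_only=true`, the first loop could silently stop at the end of the seek key's prefix, depending on the column family's mutable `prefix_extractor`.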
Reviewed By: ltamasi Differential Revision: D63147378 Pulled By: pdillinger fbshipit-source-id: 1f4477b730683d43b4be7e933338583702d3c25e --- db/arena_wrapped_db_iter.cc | 21 +- db/column_family.cc | 4 +- db/db_bloom_filter_test.cc | 202 ++++++++++++++---- db/db_impl/db_impl.cc | 17 +- db/db_impl/db_impl_open.cc | 3 +- db/db_iter.cc | 8 +- db/db_test2.cc | 58 +++-- db/db_with_timestamp_basic_test.cc | 2 + db/flush_job.cc | 8 +- db/forward_iterator.cc | 16 +- db/memtable.cc | 38 ++-- db/memtable.h | 3 +- db/memtable_list.cc | 11 +- db/memtable_list.h | 2 + db/repair.cc | 3 +- db/write_batch_test.cc | 2 +- db_stress_tool/db_stress_test_base.cc | 2 +- include/rocksdb/options.h | 19 +- java/rocksjni/write_batch_test.cc | 3 +- options/db_options.cc | 7 + options/db_options.h | 1 + table/block_based/block_based_table_reader.cc | 8 +- table/plain/plain_table_reader.cc | 6 +- table/table_test.cc | 7 +- .../bug_fixes/memtable_prefix.md | 1 + .../new_features/prefix_seek_opt_in_only.md | 1 + 26 files changed, 325 insertions(+), 128 deletions(-) create mode 100644 unreleased_history/bug_fixes/memtable_prefix.md create mode 100644 unreleased_history/new_features/prefix_seek_opt_in_only.md diff --git a/db/arena_wrapped_db_iter.cc b/db/arena_wrapped_db_iter.cc index 83cc3cfa50..22be567fdb 100644 --- a/db/arena_wrapped_db_iter.cc +++ b/db/arena_wrapped_db_iter.cc @@ -45,20 +45,23 @@ void ArenaWrappedDBIter::Init( const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration, uint64_t version_number, ReadCallback* read_callback, ColumnFamilyHandleImpl* cfh, bool expose_blob_index, bool allow_refresh) { - auto mem = arena_.AllocateAligned(sizeof(DBIter)); - db_iter_ = new (mem) DBIter( - env, read_options, ioptions, mutable_cf_options, ioptions.user_comparator, - /* iter */ nullptr, version, sequence, true, - max_sequential_skip_in_iteration, read_callback, cfh, expose_blob_index); - sv_number_ = version_number; read_options_ = read_options; - allow_refresh_ = allow_refresh; - memtable_range_tombstone_iter_ = nullptr; - if (!CheckFSFeatureSupport(env->GetFileSystem().get(), FSSupportedOps::kAsyncIO)) { read_options_.async_io = false; } + read_options_.total_order_seek |= ioptions.prefix_seek_opt_in_only; + + auto mem = arena_.AllocateAligned(sizeof(DBIter)); + db_iter_ = new (mem) DBIter(env, read_options_, ioptions, mutable_cf_options, + ioptions.user_comparator, + /* iter */ nullptr, version, sequence, true, + max_sequential_skip_in_iteration, read_callback, + cfh, expose_blob_index); + + sv_number_ = version_number; + allow_refresh_ = allow_refresh; + memtable_range_tombstone_iter_ = nullptr; } Status ArenaWrappedDBIter::Refresh() { return Refresh(nullptr); } diff --git a/db/column_family.cc b/db/column_family.cc index 2b611fda7c..8abad2941f 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -1201,8 +1201,10 @@ Status ColumnFamilyData::RangesOverlapWithMemtables( read_opts.total_order_seek = true; MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena); merge_iter_builder.AddIterator(super_version->mem->NewIterator( - read_opts, /*seqno_to_time_mapping=*/nullptr, &arena)); + read_opts, /*seqno_to_time_mapping=*/nullptr, &arena, + /*prefix_extractor=*/nullptr)); super_version->imm->AddIterators(read_opts, /*seqno_to_time_mapping=*/nullptr, + /*prefix_extractor=*/nullptr, &merge_iter_builder, false /* add_range_tombstone_iter */); ScopedArenaPtr memtable_iter(merge_iter_builder.Finish()); diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc index 
7b42f4ee5e..f0b3c40667 100644 --- a/db/db_bloom_filter_test.cc +++ b/db/db_bloom_filter_test.cc @@ -171,7 +171,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) { options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); CreateAndReopenWithCF({"pikachu"}, options); - ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); + ASSERT_FALSE(db_->KeyMayExist(ropts, handles_[1], "a", &value)); ASSERT_OK(Put(1, "a", "b")); bool value_found = false; @@ -187,7 +187,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) { uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); ASSERT_TRUE( db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found)); - ASSERT_TRUE(!value_found); + ASSERT_FALSE(value_found); // assert that no new files were opened and no new blocks were // read into block cache. ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); @@ -197,7 +197,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) { numopen = TestGetTickerCount(options, NO_FILE_OPENS); cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); + ASSERT_FALSE(db_->KeyMayExist(ropts, handles_[1], "a", &value)); ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); @@ -207,7 +207,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) { numopen = TestGetTickerCount(options, NO_FILE_OPENS); cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); + ASSERT_FALSE(db_->KeyMayExist(ropts, handles_[1], "a", &value)); ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); @@ -215,7 +215,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) { numopen = TestGetTickerCount(options, NO_FILE_OPENS); cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "c", &value)); + ASSERT_FALSE(db_->KeyMayExist(ropts, handles_[1], "c", &value)); ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); @@ -2177,24 +2177,146 @@ TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilterMultiGet) { db_->ReleaseSnapshot(snapshot); } +namespace { +std::pair GetBloomStat(const Options& options, bool sst) { + if (sst) { + return {options.statistics->getAndResetTickerCount( + NON_LAST_LEVEL_SEEK_FILTER_MATCH), + options.statistics->getAndResetTickerCount( + NON_LAST_LEVEL_SEEK_FILTERED)}; + } else { + auto hit = std::exchange(get_perf_context()->bloom_memtable_hit_count, 0); + auto miss = std::exchange(get_perf_context()->bloom_memtable_miss_count, 0); + return {hit, miss}; + } +} -TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) { - constexpr size_t kPrefixSize = 8; - const std::string kKey = "key"; - assert(kKey.size() < kPrefixSize); +std::pair HitAndMiss(uint64_t hits, uint64_t misses) { + return {hits, misses}; +} +} // namespace + +TEST_F(DBBloomFilterTest, MemtablePrefixBloom) { Options options = CurrentOptions(); - options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixSize)); + options.prefix_extractor.reset(NewFixedPrefixTransform(4)); options.memtable_prefix_bloom_size_ratio = 0.25; Reopen(options); - ASSERT_OK(Put(kKey, "v")); - ASSERT_EQ("v", Get(kKey)); - std::unique_ptr iter(dbfull()->NewIterator(ReadOptions())); - iter->Seek(kKey); 
+ ASSERT_FALSE(options.prefix_extractor->InDomain("key")); + ASSERT_OK(Put("key", "v")); + ASSERT_OK(Put("goat1", "g1")); + ASSERT_OK(Put("goat2", "g2")); + + // Reset from other tests + GetBloomStat(options, false); + + // Out of domain (Get) + ASSERT_EQ("v", Get("key")); + ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false)); + + // In domain (Get) + ASSERT_EQ("g1", Get("goat1")); + ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false)); + ASSERT_EQ("NOT_FOUND", Get("goat9")); + ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false)); + ASSERT_EQ("NOT_FOUND", Get("goan1")); + ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false)); + + ReadOptions ropts; + if (options.prefix_seek_opt_in_only) { + ropts.prefix_same_as_start = true; + } + std::unique_ptr iter(db_->NewIterator(ropts)); + // Out of domain (scan) + iter->Seek("ke"); + ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(kKey, iter->key()); - iter->SeekForPrev(kKey); + ASSERT_EQ("key", iter->key()); + iter->SeekForPrev("kez"); + ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(kKey, iter->key()); + ASSERT_EQ("key", iter->key()); + ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false)); + + // In domain (scan) + iter->Seek("goan"); + ASSERT_OK(iter->status()); + ASSERT_FALSE(iter->Valid()); + ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false)); + iter->Seek("goat"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("goat1", iter->key()); + ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false)); + + // Changing prefix extractor should affect prefix query semantics + // and bypass the existing memtable Bloom filter + ASSERT_OK(db_->SetOptions({{"prefix_extractor", "fixed:5"}})); + iter.reset(db_->NewIterator(ropts)); + // Now out of domain (scan) + iter->Seek("goan"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("goat1", iter->key()); + ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false)); + // In domain (scan) + iter->Seek("goat2"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("goat2", iter->key()); + ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false)); + // In domain (scan) + if (ropts.prefix_same_as_start) { + iter->Seek("goat0"); + ASSERT_OK(iter->status()); + ASSERT_FALSE(iter->Valid()); + ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false)); + } else { + // NOTE: legacy prefix Seek may return keys outside of prefix + } + + // Start a fresh new memtable, using new prefix extractor + ASSERT_OK(SingleDelete("key")); + ASSERT_OK(SingleDelete("goat1")); + ASSERT_OK(SingleDelete("goat2")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("key", "_v")); + ASSERT_OK(Put("goat1", "_g1")); + ASSERT_OK(Put("goat2", "_g2")); + + iter.reset(db_->NewIterator(ropts)); + + // Still out of domain (Get) + ASSERT_EQ("_v", Get("key")); + ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false)); + + // Still in domain (Get) + ASSERT_EQ("_g1", Get("goat1")); + ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false)); + ASSERT_EQ("NOT_FOUND", Get("goat11")); + ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false)); + ASSERT_EQ("NOT_FOUND", Get("goat9")); + ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false)); + ASSERT_EQ("NOT_FOUND", Get("goan1")); + ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false)); + + // Now out of domain (scan) + iter->Seek("goan"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("goat1", iter->key()); + ASSERT_EQ(HitAndMiss(0, 0), 
GetBloomStat(options, false)); + // In domain (scan) + iter->Seek("goat2"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("goat2", iter->key()); + ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false)); + // In domain (scan) + iter->Seek("goat0"); + ASSERT_OK(iter->status()); + ASSERT_FALSE(iter->Valid()); + ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false)); } class DBBloomFilterTestVaryPrefixAndFormatVer @@ -2507,7 +2629,11 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) { ASSERT_OK(Put(key1, value1, WriteOptions())); ASSERT_OK(Put(key3, value3, WriteOptions())); - std::unique_ptr iter(dbfull()->NewIterator(ReadOptions())); + ReadOptions ropts; + if (options_.prefix_seek_opt_in_only) { + ropts.prefix_same_as_start = true; + } + std::unique_ptr iter(dbfull()->NewIterator(ropts)); // check memtable bloom stats iter->Seek(key1); @@ -2526,13 +2652,13 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) { iter->Seek(key2); ASSERT_OK(iter->status()); - ASSERT_TRUE(!iter->Valid()); + ASSERT_FALSE(iter->Valid()); ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count); ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count); ASSERT_OK(Flush()); - iter.reset(dbfull()->NewIterator(ReadOptions())); + iter.reset(dbfull()->NewIterator(ropts)); // Check SST bloom stats iter->Seek(key1); @@ -2550,7 +2676,7 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) { iter->Seek(key2); ASSERT_OK(iter->status()); - ASSERT_TRUE(!iter->Valid()); + ASSERT_FALSE(iter->Valid()); ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count); ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count); } @@ -2659,9 +2785,14 @@ TEST_F(DBBloomFilterTest, PrefixScan) { PrefixScanInit(this); count = 0; env_->random_read_counter_.Reset(); - iter = db_->NewIterator(ReadOptions()); + ReadOptions ropts; + if (options.prefix_seek_opt_in_only) { + ropts.prefix_same_as_start = true; + } + iter = db_->NewIterator(ropts); for (iter->Seek(prefix); iter->Valid(); iter->Next()) { if (!iter->key().starts_with(prefix)) { + ASSERT_FALSE(ropts.prefix_same_as_start); break; } count++; @@ -3397,23 +3528,6 @@ class FixedSuffix4Transform : public SliceTransform { bool InDomain(const Slice& src) const override { return src.size() >= 4; } }; - -std::pair GetBloomStat(const Options& options, bool sst) { - if (sst) { - return {options.statistics->getAndResetTickerCount( - NON_LAST_LEVEL_SEEK_FILTER_MATCH), - options.statistics->getAndResetTickerCount( - NON_LAST_LEVEL_SEEK_FILTERED)}; - } else { - auto hit = std::exchange(get_perf_context()->bloom_memtable_hit_count, 0); - auto miss = std::exchange(get_perf_context()->bloom_memtable_miss_count, 0); - return {hit, miss}; - } -} - -std::pair HitAndMiss(uint64_t hits, uint64_t misses) { - return {hits, misses}; -} } // anonymous namespace // This uses a prefix_extractor + comparator combination that violates @@ -3520,9 +3634,8 @@ TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter2) { if (flushed) { // TODO: support auto_prefix_mode in memtable? read_options.auto_prefix_mode = true; } else { - // TODO: why needed? - get_perf_context()->bloom_memtable_hit_count = 0; - get_perf_context()->bloom_memtable_miss_count = 0; + // Reset from other tests + GetBloomStat(options, flushed); } EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(0, 0)); { @@ -3664,9 +3777,8 @@ TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter3) { if (flushed) { // TODO: support auto_prefix_mode in memtable? 
read_options.auto_prefix_mode = true; } else { - // TODO: why needed? - get_perf_context()->bloom_memtable_hit_count = 0; - get_perf_context()->bloom_memtable_miss_count = 0; + // Reset from other tests + GetBloomStat(options, flushed); } EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(0, 0)); { diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index e95561efab..b218bd0ba5 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -2066,15 +2066,19 @@ InternalIterator* DBImpl::NewInternalIterator( bool allow_unprepared_value, ArenaWrappedDBIter* db_iter) { InternalIterator* internal_iter; assert(arena != nullptr); + auto prefix_extractor = + super_version->mutable_cf_options.prefix_extractor.get(); // Need to create internal iterator from the arena. MergeIteratorBuilder merge_iter_builder( &cfd->internal_comparator(), arena, - !read_options.total_order_seek && - super_version->mutable_cf_options.prefix_extractor != nullptr, + // FIXME? It's not clear what interpretation of prefix seek is needed + // here, and no unit test cares about the value provided here. + !read_options.total_order_seek && prefix_extractor != nullptr, read_options.iterate_upper_bound); // Collect iterator for mutable memtable auto mem_iter = super_version->mem->NewIterator( - read_options, super_version->GetSeqnoToTimeMapping(), arena); + read_options, super_version->GetSeqnoToTimeMapping(), arena, + super_version->mutable_cf_options.prefix_extractor.get()); Status s; if (!read_options.ignore_range_deletions) { std::unique_ptr mem_tombstone_iter; @@ -2098,6 +2102,7 @@ InternalIterator* DBImpl::NewInternalIterator( if (s.ok()) { super_version->imm->AddIterators( read_options, super_version->GetSeqnoToTimeMapping(), + super_version->mutable_cf_options.prefix_extractor.get(), &merge_iter_builder, !read_options.ignore_range_deletions); } TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s); @@ -3843,6 +3848,9 @@ Iterator* DBImpl::NewIterator(const ReadOptions& _read_options, } } if (read_options.tailing) { + read_options.total_order_seek |= + immutable_db_options_.prefix_seek_opt_in_only; + auto iter = new ForwardIterator(this, read_options, cfd, sv, /* allow_unprepared_value */ true); result = NewDBIterator( @@ -4044,6 +4052,9 @@ Status DBImpl::NewIterators( assert(cf_sv_pairs.size() == column_families.size()); if (read_options.tailing) { + read_options.total_order_seek |= + immutable_db_options_.prefix_seek_opt_in_only; + for (const auto& cf_sv_pair : cf_sv_pairs) { auto iter = new ForwardIterator(this, read_options, cf_sv_pair.cfd, cf_sv_pair.super_version, diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 4fb1008761..dec61c0504 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -1669,7 +1669,8 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, TableProperties table_properties; { ScopedArenaPtr iter( - mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena)); + mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena, + /*prefix_extractor=*/nullptr)); ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] [WriteLevel0TableForRecovery]" " Level-0 table #%" PRIu64 ": started", diff --git a/db/db_iter.cc b/db/db_iter.cc index b42acc4bc6..e02586377f 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -65,9 +65,8 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options, valid_(false), current_entry_is_merged_(false), is_key_seqnum_zero_(false), - 
prefix_same_as_start_(mutable_cf_options.prefix_extractor - ? read_options.prefix_same_as_start - : false), + prefix_same_as_start_( + prefix_extractor_ ? read_options.prefix_same_as_start : false), pin_thru_lifetime_(read_options.pin_data), expect_total_order_inner_iter_(prefix_extractor_ == nullptr || read_options.total_order_seek || @@ -93,6 +92,9 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options, status_.PermitUncheckedError(); assert(timestamp_size_ == user_comparator_.user_comparator()->timestamp_size()); + // prefix_seek_opt_in_only should force total_order_seek whereever the caller + // is duplicating the original ReadOptions + assert(!ioptions.prefix_seek_opt_in_only || read_options.total_order_seek); } Status DBIter::GetProperty(std::string prop_name, std::string* prop) { diff --git a/db/db_test2.cc b/db/db_test2.cc index 92211ad42d..fbe66a986b 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -5597,32 +5597,45 @@ TEST_F(DBTest2, PrefixBloomFilteredOut) { bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); bbto.whole_key_filtering = false; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - DestroyAndReopen(options); - // Construct two L1 files with keys: - // f1:[aaa1 ccc1] f2:[ddd0] - ASSERT_OK(Put("aaa1", "")); - ASSERT_OK(Put("ccc1", "")); - ASSERT_OK(Flush()); - ASSERT_OK(Put("ddd0", "")); - ASSERT_OK(Flush()); - CompactRangeOptions cro; - cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip; - ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + // This test is also the primary test for prefix_seek_opt_in_only + for (bool opt_in : {false, true}) { + options.prefix_seek_opt_in_only = opt_in; + DestroyAndReopen(options); - Iterator* iter = db_->NewIterator(ReadOptions()); - ASSERT_OK(iter->status()); + // Construct two L1 files with keys: + // f1:[aaa1 ccc1] f2:[ddd0] + ASSERT_OK(Put("aaa1", "")); + ASSERT_OK(Put("ccc1", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("ddd0", "")); + ASSERT_OK(Flush()); + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); - // Bloom filter is filterd out by f1. - // This is just one of several valid position following the contract. - // Postioning to ccc1 or ddd0 is also valid. This is just to validate - // the behavior of the current implementation. If underlying implementation - // changes, the test might fail here. - iter->Seek("bbb1"); - ASSERT_OK(iter->status()); - ASSERT_FALSE(iter->Valid()); + ReadOptions ropts; + for (bool same : {false, true}) { + ropts.prefix_same_as_start = same; + std::unique_ptr iter(db_->NewIterator(ropts)); + ASSERT_OK(iter->status()); - delete iter; + iter->Seek("bbb1"); + ASSERT_OK(iter->status()); + if (opt_in && !same) { + // Unbounded total order seek + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), "ccc1"); + } else { + // Bloom filter is filterd out by f1. When same == false, this is just + // one valid position following the contract. Postioning to ccc1 or ddd0 + // is also valid. This is just to validate the behavior of the current + // implementation. If underlying implementation changes, the test might + // fail here. 
+ ASSERT_FALSE(iter->Valid()); + } + } + } } TEST_F(DBTest2, RowCacheSnapshot) { @@ -5987,6 +6000,7 @@ TEST_F(DBTest2, ChangePrefixExtractor) { // create a DB with block prefix index BlockBasedTableOptions table_options; Options options = CurrentOptions(); + options.prefix_seek_opt_in_only = false; // Use legacy prefix seek // Sometimes filter is checked based on upper bound. Assert counters // for that case. Otherwise, only check data correctness. diff --git a/db/db_with_timestamp_basic_test.cc b/db/db_with_timestamp_basic_test.cc index acb1c05c1d..60441da0b5 100644 --- a/db/db_with_timestamp_basic_test.cc +++ b/db/db_with_timestamp_basic_test.cc @@ -832,6 +832,7 @@ TEST_P(DBBasicTestWithTimestampTableOptions, GetAndMultiGet) { TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithPrefixLessThanKey) { Options options = CurrentOptions(); + options.prefix_seek_opt_in_only = false; // Use legacy prefix seek options.env = env_; options.create_if_missing = true; options.prefix_extractor.reset(NewFixedPrefixTransform(3)); @@ -1009,6 +1010,7 @@ TEST_F(DBBasicTestWithTimestamp, ChangeIterationDirection) { TestComparator test_cmp(kTimestampSize); options.comparator = &test_cmp; options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.prefix_seek_opt_in_only = false; // Use legacy prefix seek options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); DestroyAndReopen(options); const std::vector timestamps = {Timestamp(1, 1), Timestamp(0, 2), diff --git a/db/flush_job.cc b/db/flush_job.cc index 6bd71dd562..8206bd298a 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -421,8 +421,8 @@ Status FlushJob::MemPurge() { std::vector> range_del_iters; for (MemTable* m : mems_) { - memtables.push_back( - m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena)); + memtables.push_back(m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, + &arena, /*prefix_extractor=*/nullptr)); auto* range_del_iter = m->NewRangeTombstoneIterator( ro, kMaxSequenceNumber, true /* immutable_memtable */); if (range_del_iter != nullptr) { @@ -893,8 +893,8 @@ Status FlushJob::WriteLevel0Table() { db_options_.info_log, "[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n", cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber()); - memtables.push_back( - m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena)); + memtables.push_back(m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, + &arena, /*prefix_extractor=*/nullptr)); auto* range_del_iter = m->NewRangeTombstoneIterator( ro, kMaxSequenceNumber, true /* immutable_memtable */); if (range_del_iter != nullptr) { diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index a4cbdb4667..0bf7c15ab8 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -712,9 +712,11 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { UnownedPtr seqno_to_time_mapping = sv_->GetSeqnoToTimeMapping(); mutable_iter_ = - sv_->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_); - sv_->imm->AddIterators(read_options_, seqno_to_time_mapping, &imm_iters_, - &arena_); + sv_->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_, + sv_->mutable_cf_options.prefix_extractor.get()); + sv_->imm->AddIterators(read_options_, seqno_to_time_mapping, + sv_->mutable_cf_options.prefix_extractor.get(), + &imm_iters_, &arena_); if (!read_options_.ignore_range_deletions) { std::unique_ptr range_del_iter( sv_->mem->NewRangeTombstoneIterator( @@ -781,9 +783,11 @@ void ForwardIterator::RenewIterators() { UnownedPtr 
seqno_to_time_mapping = svnew->GetSeqnoToTimeMapping(); mutable_iter_ = - svnew->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_); - svnew->imm->AddIterators(read_options_, seqno_to_time_mapping, &imm_iters_, - &arena_); + svnew->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_, + svnew->mutable_cf_options.prefix_extractor.get()); + svnew->imm->AddIterators(read_options_, seqno_to_time_mapping, + svnew->mutable_cf_options.prefix_extractor.get(), + &imm_iters_, &arena_); ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(), kMaxSequenceNumber /* upper_bound */); if (!read_options_.ignore_range_deletions) { diff --git a/db/memtable.cc b/db/memtable.cc index ef1184ded4..5ba0a0dacc 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -365,9 +365,12 @@ const char* EncodeKey(std::string* scratch, const Slice& target) { class MemTableIterator : public InternalIterator { public: - MemTableIterator(const MemTable& mem, const ReadOptions& read_options, - UnownedPtr seqno_to_time_mapping, - Arena* arena, bool use_range_del_table = false) + enum Kind { kPointEntries, kRangeDelEntries }; + MemTableIterator( + Kind kind, const MemTable& mem, const ReadOptions& read_options, + UnownedPtr seqno_to_time_mapping = nullptr, + Arena* arena = nullptr, + const SliceTransform* cf_prefix_extractor = nullptr) : bloom_(nullptr), prefix_extractor_(mem.prefix_extractor_), comparator_(mem.comparator_), @@ -382,14 +385,21 @@ class MemTableIterator : public InternalIterator { arena_mode_(arena != nullptr), paranoid_memory_checks_(mem.moptions_.paranoid_memory_checks), allow_data_in_error(mem.moptions_.allow_data_in_errors) { - if (use_range_del_table) { + if (kind == kRangeDelEntries) { iter_ = mem.range_del_table_->GetIterator(arena); - } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek && - !read_options.auto_prefix_mode) { + } else if (prefix_extractor_ != nullptr && + // NOTE: checking extractor equivalence when not pointer + // equivalent is arguably too expensive for memtable + prefix_extractor_ == cf_prefix_extractor && + (read_options.prefix_same_as_start || + (!read_options.total_order_seek && + !read_options.auto_prefix_mode))) { // Auto prefix mode is not implemented in memtable yet. 
+ assert(kind == kPointEntries); bloom_ = mem.bloom_filter_.get(); iter_ = mem.table_->GetDynamicPrefixIterator(arena); } else { + assert(kind == kPointEntries); iter_ = mem.table_->GetIterator(arena); } status_.PermitUncheckedError(); @@ -433,8 +443,8 @@ class MemTableIterator : public InternalIterator { // iterator should only use prefix bloom filter Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_)); if (prefix_extractor_->InDomain(user_k_without_ts)) { - if (!bloom_->MayContain( - prefix_extractor_->Transform(user_k_without_ts))) { + Slice prefix = prefix_extractor_->Transform(user_k_without_ts); + if (!bloom_->MayContain(prefix)) { PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); valid_ = false; return; @@ -594,11 +604,13 @@ class MemTableIterator : public InternalIterator { InternalIterator* MemTable::NewIterator( const ReadOptions& read_options, - UnownedPtr seqno_to_time_mapping, Arena* arena) { + UnownedPtr seqno_to_time_mapping, Arena* arena, + const SliceTransform* prefix_extractor) { assert(arena != nullptr); auto mem = arena->AllocateAligned(sizeof(MemTableIterator)); return new (mem) - MemTableIterator(*this, read_options, seqno_to_time_mapping, arena); + MemTableIterator(MemTableIterator::kPointEntries, *this, read_options, + seqno_to_time_mapping, arena, prefix_extractor); } FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator( @@ -633,8 +645,7 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal( cache->reader_mutex.lock(); if (!cache->tombstones) { auto* unfragmented_iter = new MemTableIterator( - *this, read_options, nullptr /* seqno_to_time_mapping= */, - nullptr /* arena */, true /* use_range_del_table */); + MemTableIterator::kRangeDelEntries, *this, read_options); cache->tombstones.reset(new FragmentedRangeTombstoneList( std::unique_ptr(unfragmented_iter), comparator_.comparator)); @@ -655,8 +666,7 @@ void MemTable::ConstructFragmentedRangeTombstones() { if (!is_range_del_table_empty_.load(std::memory_order_relaxed)) { // TODO: plumb Env::IOActivity, Env::IOPriority auto* unfragmented_iter = new MemTableIterator( - *this, ReadOptions(), nullptr /*seqno_to_time_mapping=*/, - nullptr /* arena */, true /* use_range_del_table */); + MemTableIterator::kRangeDelEntries, *this, ReadOptions()); fragmented_range_tombstone_list_ = std::make_unique( diff --git a/db/memtable.h b/db/memtable.h index ca0652bc04..194b4543c2 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -210,7 +210,8 @@ class MemTable { // data, currently only needed for iterators serving user reads. InternalIterator* NewIterator( const ReadOptions& read_options, - UnownedPtr seqno_to_time_mapping, Arena* arena); + UnownedPtr seqno_to_time_mapping, Arena* arena, + const SliceTransform* prefix_extractor); // Returns an iterator that yields the range tombstones of the memtable. 
// The caller must ensure that the underlying MemTable remains live diff --git a/db/memtable_list.cc b/db/memtable_list.cc index c3612656e2..c81c096b51 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -214,20 +214,23 @@ Status MemTableListVersion::AddRangeTombstoneIterators( void MemTableListVersion::AddIterators( const ReadOptions& options, UnownedPtr seqno_to_time_mapping, + const SliceTransform* prefix_extractor, std::vector* iterator_list, Arena* arena) { for (auto& m : memlist_) { - iterator_list->push_back( - m->NewIterator(options, seqno_to_time_mapping, arena)); + iterator_list->push_back(m->NewIterator(options, seqno_to_time_mapping, + arena, prefix_extractor)); } } void MemTableListVersion::AddIterators( const ReadOptions& options, UnownedPtr seqno_to_time_mapping, + const SliceTransform* prefix_extractor, MergeIteratorBuilder* merge_iter_builder, bool add_range_tombstone_iter) { for (auto& m : memlist_) { - auto mem_iter = m->NewIterator(options, seqno_to_time_mapping, - merge_iter_builder->GetArena()); + auto mem_iter = + m->NewIterator(options, seqno_to_time_mapping, + merge_iter_builder->GetArena(), prefix_extractor); if (!add_range_tombstone_iter || options.ignore_range_deletions) { merge_iter_builder->AddIterator(mem_iter); } else { diff --git a/db/memtable_list.h b/db/memtable_list.h index dd439de559..390b4137dd 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -113,11 +113,13 @@ class MemTableListVersion { void AddIterators(const ReadOptions& options, UnownedPtr seqno_to_time_mapping, + const SliceTransform* prefix_extractor, std::vector* iterator_list, Arena* arena); void AddIterators(const ReadOptions& options, UnownedPtr seqno_to_time_mapping, + const SliceTransform* prefix_extractor, MergeIteratorBuilder* merge_iter_builder, bool add_range_tombstone_iter); diff --git a/db/repair.cc b/db/repair.cc index 114d36a6a8..f7f4fbafb7 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -443,7 +443,8 @@ class Repairer { ro.total_order_seek = true; Arena arena; ScopedArenaPtr iter( - mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena)); + mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena, + /*prefix_extractor=*/nullptr)); int64_t _current_time = 0; immutable_db_options_.clock->GetCurrentTime(&_current_time) .PermitUncheckedError(); // ignore error diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 8db8c32a0a..d09dd0167d 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -59,7 +59,7 @@ static std::string PrintContents(WriteBatch* b, InternalIterator* iter; if (i == 0) { iter = mem->NewIterator(ReadOptions(), /*seqno_to_time_mapping=*/nullptr, - &arena); + &arena, /*prefix_extractor=*/nullptr); arena_iter_guard.reset(iter); } else { iter = mem->NewRangeTombstoneIterator(ReadOptions(), diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 5d4839df87..5a4c310c41 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -1592,7 +1592,7 @@ Status StressTest::TestIterateImpl(ThreadState* thread, ro.total_order_seek = true; expect_total_order = true; } else if (thread->rand.OneIn(4)) { - ro.total_order_seek = false; + ro.total_order_seek = thread->rand.OneIn(2); ro.auto_prefix_mode = true; expect_total_order = true; } else if (options_.prefix_extractor.get() == nullptr) { diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 27feadb804..e272e3a69a 100644 --- a/include/rocksdb/options.h +++ 
b/include/rocksdb/options.h @@ -1399,6 +1399,17 @@ struct DBOptions { // eventually be obsolete and removed as Identity files are phased out. bool write_identity_file = true; + // Historically, when prefix_extractor != nullptr, iterators have an + // unfortunate default semantics of *possibly* only returning data + // within the same prefix. To avoid "spooky action at a distance," iterator + // bounds should come from the instantiation or seeking of the iterator, + // not from a mutable column family option. + // + // When set to true, it is as if every iterator is created with + // total_order_seek=true and only auto_prefix_mode=true and + // prefix_same_as_start=true can take advantage of prefix seek optimizations. + bool prefix_seek_opt_in_only = false; + // The number of bytes to prefetch when reading the log. This is mostly useful // for reading a remotely located log, as it can save the number of // round-trips. If 0, then the prefetching is disabled. @@ -1848,10 +1859,10 @@ struct ReadOptions { bool auto_prefix_mode = false; // Enforce that the iterator only iterates over the same prefix as the seek. - // This option is effective only for prefix seeks, i.e. prefix_extractor is - // non-null for the column family and total_order_seek is false. Unlike - // iterate_upper_bound, prefix_same_as_start only works within a prefix - // but in both directions. + // This makes the iterator bounds dependent on the column family's current + // prefix_extractor, which is mutable. When SST files have been built with + // the same prefix extractor, prefix filtering optimizations will be used + // for both Seek and SeekForPrev. bool prefix_same_as_start = false; // Keep the blocks loaded by the iterator pinned in memory as long as the diff --git a/java/rocksjni/write_batch_test.cc b/java/rocksjni/write_batch_test.cc index 53f10998ca..ed456acc3e 100644 --- a/java/rocksjni/write_batch_test.cc +++ b/java/rocksjni/write_batch_test.cc @@ -60,7 +60,8 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(JNIEnv* env, ROCKSDB_NAMESPACE::Arena arena; ROCKSDB_NAMESPACE::ScopedArenaPtr iter( mem->NewIterator(ROCKSDB_NAMESPACE::ReadOptions(), - /*seqno_to_time_mapping=*/nullptr, &arena)); + /*seqno_to_time_mapping=*/nullptr, &arena, + /*prefix_extractor=*/nullptr)); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ROCKSDB_NAMESPACE::ParsedInternalKey ikey; ikey.clear(); diff --git a/options/db_options.cc b/options/db_options.cc index 0b880f4b93..e9e8fc5b7e 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -403,6 +403,10 @@ static std::unordered_map {offsetof(struct ImmutableDBOptions, avoid_unnecessary_blocking_io), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"prefix_seek_opt_in_only", + {offsetof(struct ImmutableDBOptions, prefix_seek_opt_in_only), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, {"write_dbid_to_manifest", {offsetof(struct ImmutableDBOptions, write_dbid_to_manifest), OptionType::kBoolean, OptionVerificationType::kNormal, @@ -774,6 +778,7 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) background_close_inactive_wals(options.background_close_inactive_wals), atomic_flush(options.atomic_flush), avoid_unnecessary_blocking_io(options.avoid_unnecessary_blocking_io), + prefix_seek_opt_in_only(options.prefix_seek_opt_in_only), persist_stats_to_disk(options.persist_stats_to_disk), write_dbid_to_manifest(options.write_dbid_to_manifest), 
write_identity_file(options.write_identity_file), @@ -948,6 +953,8 @@ void ImmutableDBOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, " Options.avoid_unnecessary_blocking_io: %d", avoid_unnecessary_blocking_io); + ROCKS_LOG_HEADER(log, " Options.prefix_seek_opt_in_only: %d", + prefix_seek_opt_in_only); ROCKS_LOG_HEADER(log, " Options.persist_stats_to_disk: %u", persist_stats_to_disk); ROCKS_LOG_HEADER(log, " Options.write_dbid_to_manifest: %d", diff --git a/options/db_options.h b/options/db_options.h index 842fefca86..ac76ea40d8 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -87,6 +87,7 @@ struct ImmutableDBOptions { bool background_close_inactive_wals; bool atomic_flush; bool avoid_unnecessary_blocking_io; + bool prefix_seek_opt_in_only; bool persist_stats_to_disk; bool write_dbid_to_manifest; bool write_identity_file; diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index fe45224d08..0dfe3e38ab 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -2063,7 +2063,9 @@ InternalIterator* BlockBasedTable::NewIterator( if (arena == nullptr) { return new BlockBasedTableIterator( this, read_options, rep_->internal_comparator, std::move(index_iter), - !skip_filters && !read_options.total_order_seek && + !skip_filters && + (!read_options.total_order_seek || read_options.auto_prefix_mode || + read_options.prefix_same_as_start) && prefix_extractor != nullptr, need_upper_bound_check, prefix_extractor, caller, compaction_readahead_size, allow_unprepared_value); @@ -2071,7 +2073,9 @@ InternalIterator* BlockBasedTable::NewIterator( auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator)); return new (mem) BlockBasedTableIterator( this, read_options, rep_->internal_comparator, std::move(index_iter), - !skip_filters && !read_options.total_order_seek && + !skip_filters && + (!read_options.total_order_seek || read_options.auto_prefix_mode || + read_options.prefix_same_as_start) && prefix_extractor != nullptr, need_upper_bound_check, prefix_extractor, caller, compaction_readahead_size, allow_unprepared_value); diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc index d3c968f73a..9d4e8eccf3 100644 --- a/table/plain/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -201,8 +201,10 @@ InternalIterator* PlainTableReader::NewIterator( assert(table_properties_); // Auto prefix mode is not implemented in PlainTable. 
- bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek && - !options.auto_prefix_mode; + bool use_prefix_seek = + !IsTotalOrderMode() && + (options.prefix_same_as_start || + (!options.total_order_seek && !options.auto_prefix_mode)); if (arena == nullptr) { return new PlainTableIterator(this, use_prefix_seek); } else { diff --git a/table/table_test.cc b/table/table_test.cc index 9eee267614..bb0b70222c 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -534,7 +534,7 @@ class MemTableConstructor : public Constructor { const SliceTransform* /*prefix_extractor*/) const override { return new KeyConvertingIterator( memtable_->NewIterator(ReadOptions(), /*seqno_to_time_mapping=*/nullptr, - &arena_), + &arena_, /*prefix_extractor=*/nullptr), true); } @@ -4904,8 +4904,9 @@ TEST_F(MemTableTest, Simple) { std::unique_ptr iter_guard; InternalIterator* iter; if (i == 0) { - iter = GetMemTable()->NewIterator( - ReadOptions(), /*seqno_to_time_mapping=*/nullptr, &arena); + iter = GetMemTable()->NewIterator(ReadOptions(), + /*seqno_to_time_mapping=*/nullptr, + &arena, /*prefix_extractor=*/nullptr); arena_iter_guard.reset(iter); } else { iter = GetMemTable()->NewRangeTombstoneIterator( diff --git a/unreleased_history/bug_fixes/memtable_prefix.md b/unreleased_history/bug_fixes/memtable_prefix.md new file mode 100644 index 0000000000..d7b45c65ee --- /dev/null +++ b/unreleased_history/bug_fixes/memtable_prefix.md @@ -0,0 +1 @@ +* Fix handling of dynamic change of `prefix_extractor` with memtable prefix filter. Previously, prefix seek could mix different prefix interpretations between memtable and SST files. Now the latest `prefix_extractor` at the time of iterator creation or refresh is respected. diff --git a/unreleased_history/new_features/prefix_seek_opt_in_only.md b/unreleased_history/new_features/prefix_seek_opt_in_only.md new file mode 100644 index 0000000000..71c3cd6893 --- /dev/null +++ b/unreleased_history/new_features/prefix_seek_opt_in_only.md @@ -0,0 +1 @@ +* Add new option `prefix_seek_opt_in_only` that makes iterators generally safer when you might set a `prefix_extractor`. When `prefix_seek_opt_in_only=true`, which is expected to be the future default, prefix seek is only used when `prefix_same_as_start` or `auto_prefix_mode` are set. Also, `prefix_same_as_start` and `auto_prefix_mode` now allow prefix filtering even with `total_order_seek=true`.
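Usage sketch for the release notes above (illustrative only, not part of the patch): iterators created after dynamically changing `prefix_extractor` follow the latest extractor for both memtable and SST data, with prefix seek requested explicitly via `prefix_same_as_start`. The `fixed:5` extractor and the key names are arbitrary examples.

```cpp
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

// Sketch: an iterator created after SetOptions() uses the latest
// prefix_extractor for both memtable and SST data (the behavior
// guaranteed by the memtable prefix fix above).
void ChangeExtractorAndScan(rocksdb::DB* db) {
  // Switch to a 5-byte fixed prefix extractor at runtime.
  rocksdb::Status s = db->SetOptions({{"prefix_extractor", "fixed:5"}});
  if (!s.ok()) {
    return;
  }

  rocksdb::ReadOptions ro;
  ro.prefix_same_as_start = true;  // explicit prefix seek (opt-in friendly)
  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
  for (it->Seek("goat1"); it->Valid(); it->Next()) {
    // Bounded by the 5-byte prefix "goat1" under the new extractor, even for
    // keys that were inserted into the memtable under the old extractor.
  }
}
```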