diff --git a/db/arena_wrapped_db_iter.cc b/db/arena_wrapped_db_iter.cc
index 83cc3cfa50..22be567fdb 100644
--- a/db/arena_wrapped_db_iter.cc
+++ b/db/arena_wrapped_db_iter.cc
@@ -45,20 +45,23 @@ void ArenaWrappedDBIter::Init(
    const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration,
    uint64_t version_number, ReadCallback* read_callback,
    ColumnFamilyHandleImpl* cfh, bool expose_blob_index, bool allow_refresh) {
-  auto mem = arena_.AllocateAligned(sizeof(DBIter));
-  db_iter_ = new (mem) DBIter(
-      env, read_options, ioptions, mutable_cf_options, ioptions.user_comparator,
-      /* iter */ nullptr, version, sequence, true,
-      max_sequential_skip_in_iteration, read_callback, cfh, expose_blob_index);
-  sv_number_ = version_number;
   read_options_ = read_options;
-  allow_refresh_ = allow_refresh;
-  memtable_range_tombstone_iter_ = nullptr;
-
   if (!CheckFSFeatureSupport(env->GetFileSystem().get(),
                              FSSupportedOps::kAsyncIO)) {
     read_options_.async_io = false;
   }
+  read_options_.total_order_seek |= ioptions.prefix_seek_opt_in_only;
+
+  auto mem = arena_.AllocateAligned(sizeof(DBIter));
+  db_iter_ = new (mem) DBIter(env, read_options_, ioptions, mutable_cf_options,
+                              ioptions.user_comparator,
+                              /* iter */ nullptr, version, sequence, true,
+                              max_sequential_skip_in_iteration, read_callback,
+                              cfh, expose_blob_index);
+
+  sv_number_ = version_number;
+  allow_refresh_ = allow_refresh;
+  memtable_range_tombstone_iter_ = nullptr;
 }

 Status ArenaWrappedDBIter::Refresh() { return Refresh(nullptr); }
diff --git a/db/column_family.cc b/db/column_family.cc
index 2b611fda7c..8abad2941f 100644
--- a/db/column_family.cc
+++ b/db/column_family.cc
@@ -1201,8 +1201,10 @@ Status ColumnFamilyData::RangesOverlapWithMemtables(
   read_opts.total_order_seek = true;
   MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena);
   merge_iter_builder.AddIterator(super_version->mem->NewIterator(
-      read_opts, /*seqno_to_time_mapping=*/nullptr, &arena));
+      read_opts, /*seqno_to_time_mapping=*/nullptr, &arena,
+      /*prefix_extractor=*/nullptr));
   super_version->imm->AddIterators(read_opts, /*seqno_to_time_mapping=*/nullptr,
+                                   /*prefix_extractor=*/nullptr,
                                    &merge_iter_builder,
                                    false /* add_range_tombstone_iter */);
   ScopedArenaPtr<InternalIterator> memtable_iter(merge_iter_builder.Finish());
diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc
index 7b42f4ee5e..f0b3c40667 100644
--- a/db/db_bloom_filter_test.cc
+++ b/db/db_bloom_filter_test.cc
@@ -171,7 +171,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) {
     options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
     CreateAndReopenWithCF({"pikachu"}, options);
-    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+    ASSERT_FALSE(db_->KeyMayExist(ropts, handles_[1], "a", &value));
     ASSERT_OK(Put(1, "a", "b"));
     bool value_found = false;
@@ -187,7 +187,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) {
     uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
     ASSERT_TRUE(
        db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found));
-    ASSERT_TRUE(!value_found);
+    ASSERT_FALSE(value_found);
    // assert that no new files were opened and no new blocks were
    // read into block cache.
    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
@@ -197,7 +197,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) {
    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
-    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+    ASSERT_FALSE(db_->KeyMayExist(ropts, handles_[1], "a", &value));
    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
@@ -207,7 +207,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) {
    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
-    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+    ASSERT_FALSE(db_->KeyMayExist(ropts, handles_[1], "a", &value));
    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
@@ -215,7 +215,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) {
    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
-    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "c", &value));
+    ASSERT_FALSE(db_->KeyMayExist(ropts, handles_[1], "c", &value));
    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
@@ -2177,24 +2177,146 @@ TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilterMultiGet) {
   db_->ReleaseSnapshot(snapshot);
 }
+namespace {
+std::pair<uint64_t, uint64_t> GetBloomStat(const Options& options, bool sst) {
+  if (sst) {
+    return {options.statistics->getAndResetTickerCount(
+                NON_LAST_LEVEL_SEEK_FILTER_MATCH),
+            options.statistics->getAndResetTickerCount(
+                NON_LAST_LEVEL_SEEK_FILTERED)};
+  } else {
+    auto hit = std::exchange(get_perf_context()->bloom_memtable_hit_count, 0);
+    auto miss = std::exchange(get_perf_context()->bloom_memtable_miss_count, 0);
+    return {hit, miss};
+  }
+}
-TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) {
-  constexpr size_t kPrefixSize = 8;
-  const std::string kKey = "key";
-  assert(kKey.size() < kPrefixSize);
+std::pair<uint64_t, uint64_t> HitAndMiss(uint64_t hits, uint64_t misses) {
+  return {hits, misses};
+}
+}  // namespace
+
+TEST_F(DBBloomFilterTest, MemtablePrefixBloom) {
   Options options = CurrentOptions();
-  options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixSize));
+  options.prefix_extractor.reset(NewFixedPrefixTransform(4));
   options.memtable_prefix_bloom_size_ratio = 0.25;
   Reopen(options);
-  ASSERT_OK(Put(kKey, "v"));
-  ASSERT_EQ("v", Get(kKey));
-  std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
-  iter->Seek(kKey);
+  ASSERT_FALSE(options.prefix_extractor->InDomain("key"));
+  ASSERT_OK(Put("key", "v"));
+  ASSERT_OK(Put("goat1", "g1"));
+  ASSERT_OK(Put("goat2", "g2"));
+
+  // Reset from other tests
+  GetBloomStat(options, false);
+
+  // Out of domain (Get)
+  ASSERT_EQ("v", Get("key"));
+  ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+
+  // In domain (Get)
+  ASSERT_EQ("g1", Get("goat1"));
+  ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false));
+  ASSERT_EQ("NOT_FOUND", Get("goat9"));
+  ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false));
+  ASSERT_EQ("NOT_FOUND", Get("goan1"));
+  ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false));
+
+  ReadOptions ropts;
+  if (options.prefix_seek_opt_in_only) {
+    ropts.prefix_same_as_start = true;
+  }
+  std::unique_ptr<Iterator> iter(db_->NewIterator(ropts));
+  // Out of domain (scan)
+  iter->Seek("ke");
+  ASSERT_OK(iter->status());
   ASSERT_TRUE(iter->Valid());
-  ASSERT_EQ(kKey, iter->key());
-  iter->SeekForPrev(kKey);
+  ASSERT_EQ("key", iter->key());
+  iter->SeekForPrev("kez");
+  ASSERT_OK(iter->status());
   ASSERT_TRUE(iter->Valid());
-  ASSERT_EQ(kKey, iter->key());
+  ASSERT_EQ("key", iter->key());
+  ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+
+  // In domain (scan)
+  iter->Seek("goan");
+  ASSERT_OK(iter->status());
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false));
+  iter->Seek("goat");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("goat1", iter->key());
+  ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false));
+
+  // Changing prefix extractor should affect prefix query semantics
+  // and bypass the existing memtable Bloom filter
+  ASSERT_OK(db_->SetOptions({{"prefix_extractor", "fixed:5"}}));
+  iter.reset(db_->NewIterator(ropts));
+  // Now out of domain (scan)
+  iter->Seek("goan");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("goat1", iter->key());
+  ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+  // In domain (scan)
+  iter->Seek("goat2");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("goat2", iter->key());
+  ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+  // In domain (scan)
+  if (ropts.prefix_same_as_start) {
+    iter->Seek("goat0");
+    ASSERT_OK(iter->status());
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+  } else {
+    // NOTE: legacy prefix Seek may return keys outside of prefix
+  }
+
+  // Start a fresh new memtable, using new prefix extractor
+  ASSERT_OK(SingleDelete("key"));
+  ASSERT_OK(SingleDelete("goat1"));
+  ASSERT_OK(SingleDelete("goat2"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("key", "_v"));
+  ASSERT_OK(Put("goat1", "_g1"));
+  ASSERT_OK(Put("goat2", "_g2"));
+
+  iter.reset(db_->NewIterator(ropts));
+
+  // Still out of domain (Get)
+  ASSERT_EQ("_v", Get("key"));
+  ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+
+  // Still in domain (Get)
+  ASSERT_EQ("_g1", Get("goat1"));
+  ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false));
+  ASSERT_EQ("NOT_FOUND", Get("goat11"));
+  ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false));
+  ASSERT_EQ("NOT_FOUND", Get("goat9"));
+  ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false));
+  ASSERT_EQ("NOT_FOUND", Get("goan1"));
+  ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false));
+
+  // Now out of domain (scan)
+  iter->Seek("goan");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("goat1", iter->key());
+  ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false));
+  // In domain (scan)
+  iter->Seek("goat2");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("goat2", iter->key());
+  ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false));
+  // In domain (scan)
+  iter->Seek("goat0");
+  ASSERT_OK(iter->status());
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false));
 }

 class DBBloomFilterTestVaryPrefixAndFormatVer
@@ -2507,7 +2629,11 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) {
   ASSERT_OK(Put(key1, value1, WriteOptions()));
   ASSERT_OK(Put(key3, value3, WriteOptions()));

-  std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
+  ReadOptions ropts;
+  if (options_.prefix_seek_opt_in_only) {
+    ropts.prefix_same_as_start = true;
+  }
+  std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ropts));

   // check memtable bloom stats
   iter->Seek(key1);
@@ -2526,13 +2652,13 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) {
   iter->Seek(key2);
   ASSERT_OK(iter->status());
-  ASSERT_TRUE(!iter->Valid());
+  ASSERT_FALSE(iter->Valid());
   ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
   ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);

   ASSERT_OK(Flush());

-  iter.reset(dbfull()->NewIterator(ReadOptions()));
+  iter.reset(dbfull()->NewIterator(ropts));

   // Check SST bloom stats
   iter->Seek(key1);
@@ -2550,7 +2676,7 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) {
   iter->Seek(key2);
   ASSERT_OK(iter->status());
-  ASSERT_TRUE(!iter->Valid());
+  ASSERT_FALSE(iter->Valid());
   ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count);
   ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count);
 }
@@ -2659,9 +2785,14 @@ TEST_F(DBBloomFilterTest, PrefixScan) {
   PrefixScanInit(this);
   count = 0;
   env_->random_read_counter_.Reset();
-  iter = db_->NewIterator(ReadOptions());
+  ReadOptions ropts;
+  if (options.prefix_seek_opt_in_only) {
+    ropts.prefix_same_as_start = true;
+  }
+  iter = db_->NewIterator(ropts);
   for (iter->Seek(prefix); iter->Valid(); iter->Next()) {
     if (!iter->key().starts_with(prefix)) {
+      ASSERT_FALSE(ropts.prefix_same_as_start);
       break;
     }
     count++;
@@ -3397,23 +3528,6 @@ class FixedSuffix4Transform : public SliceTransform {

   bool InDomain(const Slice& src) const override { return src.size() >= 4; }
 };
-
-std::pair<uint64_t, uint64_t> GetBloomStat(const Options& options, bool sst) {
-  if (sst) {
-    return {options.statistics->getAndResetTickerCount(
-                NON_LAST_LEVEL_SEEK_FILTER_MATCH),
-            options.statistics->getAndResetTickerCount(
-                NON_LAST_LEVEL_SEEK_FILTERED)};
-  } else {
-    auto hit = std::exchange(get_perf_context()->bloom_memtable_hit_count, 0);
-    auto miss = std::exchange(get_perf_context()->bloom_memtable_miss_count, 0);
-    return {hit, miss};
-  }
-}
-
-std::pair<uint64_t, uint64_t> HitAndMiss(uint64_t hits, uint64_t misses) {
-  return {hits, misses};
-}
 }  // anonymous namespace

 // This uses a prefix_extractor + comparator combination that violates
@@ -3520,9 +3634,8 @@ TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter2) {
    if (flushed) {  // TODO: support auto_prefix_mode in memtable?
      read_options.auto_prefix_mode = true;
    } else {
-      // TODO: why needed?
-      get_perf_context()->bloom_memtable_hit_count = 0;
-      get_perf_context()->bloom_memtable_miss_count = 0;
+      // Reset from other tests
+      GetBloomStat(options, flushed);
    }
    EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(0, 0));
    {
@@ -3664,9 +3777,8 @@ TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter3) {
    if (flushed) {  // TODO: support auto_prefix_mode in memtable?
      read_options.auto_prefix_mode = true;
    } else {
-      // TODO: why needed?
-      get_perf_context()->bloom_memtable_hit_count = 0;
-      get_perf_context()->bloom_memtable_miss_count = 0;
+      // Reset from other tests
+      GetBloomStat(options, flushed);
    }
    EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(0, 0));
    {
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index e95561efab..b218bd0ba5 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -2066,15 +2066,19 @@ InternalIterator* DBImpl::NewInternalIterator(
    bool allow_unprepared_value, ArenaWrappedDBIter* db_iter) {
  InternalIterator* internal_iter;
  assert(arena != nullptr);
+  auto prefix_extractor =
+      super_version->mutable_cf_options.prefix_extractor.get();
  // Need to create internal iterator from the arena.
  MergeIteratorBuilder merge_iter_builder(
      &cfd->internal_comparator(), arena,
-      !read_options.total_order_seek &&
-          super_version->mutable_cf_options.prefix_extractor != nullptr,
+      // FIXME? It's not clear what interpretation of prefix seek is needed
+      // here, and no unit test cares about the value provided here.
+      !read_options.total_order_seek && prefix_extractor != nullptr,
      read_options.iterate_upper_bound);
  // Collect iterator for mutable memtable
  auto mem_iter = super_version->mem->NewIterator(
-      read_options, super_version->GetSeqnoToTimeMapping(), arena);
+      read_options, super_version->GetSeqnoToTimeMapping(), arena,
+      super_version->mutable_cf_options.prefix_extractor.get());
  Status s;
  if (!read_options.ignore_range_deletions) {
    std::unique_ptr<TruncatedRangeDelIterator> mem_tombstone_iter;
@@ -2098,6 +2102,7 @@ InternalIterator* DBImpl::NewInternalIterator(
  if (s.ok()) {
    super_version->imm->AddIterators(
        read_options, super_version->GetSeqnoToTimeMapping(),
+        super_version->mutable_cf_options.prefix_extractor.get(),
        &merge_iter_builder, !read_options.ignore_range_deletions);
  }
  TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s);
@@ -3843,6 +3848,9 @@ Iterator* DBImpl::NewIterator(const ReadOptions& _read_options,
    }
  }
  if (read_options.tailing) {
+    read_options.total_order_seek |=
+        immutable_db_options_.prefix_seek_opt_in_only;
+
    auto iter = new ForwardIterator(this, read_options, cfd, sv,
                                    /* allow_unprepared_value */ true);
    result = NewDBIterator(
@@ -4044,6 +4052,9 @@ Status DBImpl::NewIterators(
  assert(cf_sv_pairs.size() == column_families.size());
  if (read_options.tailing) {
+    read_options.total_order_seek |=
+        immutable_db_options_.prefix_seek_opt_in_only;
+
    for (const auto& cf_sv_pair : cf_sv_pairs) {
      auto iter = new ForwardIterator(this, read_options, cf_sv_pair.cfd,
                                      cf_sv_pair.super_version,
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index 4fb1008761..dec61c0504 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -1669,7 +1669,8 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
  TableProperties table_properties;
  {
    ScopedArenaPtr<InternalIterator> iter(
-        mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
+        mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena,
+                         /*prefix_extractor=*/nullptr));
    ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
                    "[%s] [WriteLevel0TableForRecovery]"
                    " Level-0 table #%" PRIu64 ": started",
diff --git a/db/db_iter.cc b/db/db_iter.cc
index b42acc4bc6..e02586377f 100644
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -65,9 +65,8 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
      valid_(false),
      current_entry_is_merged_(false),
      is_key_seqnum_zero_(false),
-      prefix_same_as_start_(mutable_cf_options.prefix_extractor
-                                ? read_options.prefix_same_as_start
-                                : false),
+      prefix_same_as_start_(
+          prefix_extractor_ ? read_options.prefix_same_as_start : false),
      pin_thru_lifetime_(read_options.pin_data),
      expect_total_order_inner_iter_(prefix_extractor_ == nullptr ||
                                     read_options.total_order_seek ||
@@ -93,6 +92,9 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
  status_.PermitUncheckedError();
  assert(timestamp_size_ ==
         user_comparator_.user_comparator()->timestamp_size());
+  // prefix_seek_opt_in_only should force total_order_seek wherever the caller
+  // is duplicating the original ReadOptions
+  assert(!ioptions.prefix_seek_opt_in_only || read_options.total_order_seek);
 }

 Status DBIter::GetProperty(std::string prop_name, std::string* prop) {
diff --git a/db/db_test2.cc b/db/db_test2.cc
index 92211ad42d..fbe66a986b 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -5597,32 +5597,45 @@ TEST_F(DBTest2, PrefixBloomFilteredOut) {
  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
  bbto.whole_key_filtering = false;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-  DestroyAndReopen(options);
-  // Construct two L1 files with keys:
-  // f1:[aaa1 ccc1] f2:[ddd0]
-  ASSERT_OK(Put("aaa1", ""));
-  ASSERT_OK(Put("ccc1", ""));
-  ASSERT_OK(Flush());
-  ASSERT_OK(Put("ddd0", ""));
-  ASSERT_OK(Flush());
-  CompactRangeOptions cro;
-  cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
-  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  // This test is also the primary test for prefix_seek_opt_in_only
+  for (bool opt_in : {false, true}) {
+    options.prefix_seek_opt_in_only = opt_in;
+    DestroyAndReopen(options);

-  Iterator* iter = db_->NewIterator(ReadOptions());
-  ASSERT_OK(iter->status());
+    // Construct two L1 files with keys:
+    // f1:[aaa1 ccc1] f2:[ddd0]
+    ASSERT_OK(Put("aaa1", ""));
+    ASSERT_OK(Put("ccc1", ""));
+    ASSERT_OK(Flush());
+    ASSERT_OK(Put("ddd0", ""));
+    ASSERT_OK(Flush());
+    CompactRangeOptions cro;
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));

-  // Bloom filter is filterd out by f1.
-  // This is just one of several valid position following the contract.
-  // Postioning to ccc1 or ddd0 is also valid. This is just to validate
-  // the behavior of the current implementation. If underlying implementation
-  // changes, the test might fail here.
-  iter->Seek("bbb1");
-  ASSERT_OK(iter->status());
-  ASSERT_FALSE(iter->Valid());
+    ReadOptions ropts;
+    for (bool same : {false, true}) {
+      ropts.prefix_same_as_start = same;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(ropts));
+      ASSERT_OK(iter->status());

-  delete iter;
+      iter->Seek("bbb1");
+      ASSERT_OK(iter->status());
+      if (opt_in && !same) {
+        // Unbounded total order seek
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ(iter->key(), "ccc1");
+      } else {
+        // Bloom filter is filtered out by f1. When same == false, this is just
+        // one valid position following the contract. Positioning to ccc1 or
+        // ddd0 is also valid. This is just to validate the behavior of the
+        // current implementation. If the underlying implementation changes,
+        // the test might fail here.
+        ASSERT_FALSE(iter->Valid());
+      }
+    }
+  }
 }

 TEST_F(DBTest2, RowCacheSnapshot) {
@@ -5987,6 +6000,7 @@ TEST_F(DBTest2, ChangePrefixExtractor) {
  // create a DB with block prefix index
  BlockBasedTableOptions table_options;
  Options options = CurrentOptions();
+  options.prefix_seek_opt_in_only = false;  // Use legacy prefix seek

  // Sometimes filter is checked based on upper bound. Assert counters
  // for that case. Otherwise, only check data correctness.
diff --git a/db/db_with_timestamp_basic_test.cc b/db/db_with_timestamp_basic_test.cc
index acb1c05c1d..60441da0b5 100644
--- a/db/db_with_timestamp_basic_test.cc
+++ b/db/db_with_timestamp_basic_test.cc
@@ -832,6 +832,7 @@ TEST_P(DBBasicTestWithTimestampTableOptions, GetAndMultiGet) {
 TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithPrefixLessThanKey) {
   Options options = CurrentOptions();
+  options.prefix_seek_opt_in_only = false;  // Use legacy prefix seek
   options.env = env_;
   options.create_if_missing = true;
   options.prefix_extractor.reset(NewFixedPrefixTransform(3));
@@ -1009,6 +1010,7 @@ TEST_F(DBBasicTestWithTimestamp, ChangeIterationDirection) {
   TestComparator test_cmp(kTimestampSize);
   options.comparator = &test_cmp;
   options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+  options.prefix_seek_opt_in_only = false;  // Use legacy prefix seek
   options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
   DestroyAndReopen(options);
   const std::vector<std::string> timestamps = {Timestamp(1, 1), Timestamp(0, 2),
diff --git a/db/flush_job.cc b/db/flush_job.cc
index 6bd71dd562..8206bd298a 100644
--- a/db/flush_job.cc
+++ b/db/flush_job.cc
@@ -421,8 +421,8 @@ Status FlushJob::MemPurge() {
    std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
        range_del_iters;
    for (MemTable* m : mems_) {
-      memtables.push_back(
-          m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
+      memtables.push_back(m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr,
+                                         &arena, /*prefix_extractor=*/nullptr));
      auto* range_del_iter = m->NewRangeTombstoneIterator(
          ro, kMaxSequenceNumber, true /* immutable_memtable */);
      if (range_del_iter != nullptr) {
@@ -893,8 +893,8 @@ Status FlushJob::WriteLevel0Table() {
          db_options_.info_log,
          "[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n",
          cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber());
-      memtables.push_back(
-          m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
+      memtables.push_back(m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr,
+                                         &arena, /*prefix_extractor=*/nullptr));
      auto* range_del_iter = m->NewRangeTombstoneIterator(
          ro, kMaxSequenceNumber, true /* immutable_memtable */);
      if (range_del_iter != nullptr) {
diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc
index a4cbdb4667..0bf7c15ab8 100644
--- a/db/forward_iterator.cc
+++ b/db/forward_iterator.cc
@@ -712,9 +712,11 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) {
  UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping =
      sv_->GetSeqnoToTimeMapping();
  mutable_iter_ =
-      sv_->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_);
-  sv_->imm->AddIterators(read_options_, seqno_to_time_mapping, &imm_iters_,
-                         &arena_);
+      sv_->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_,
+                            sv_->mutable_cf_options.prefix_extractor.get());
+  sv_->imm->AddIterators(read_options_, seqno_to_time_mapping,
+                         sv_->mutable_cf_options.prefix_extractor.get(),
+                         &imm_iters_, &arena_);
  if (!read_options_.ignore_range_deletions) {
    std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
        sv_->mem->NewRangeTombstoneIterator(
@@ -781,9 +783,11 @@ void ForwardIterator::RenewIterators() {
  UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping =
      svnew->GetSeqnoToTimeMapping();
  mutable_iter_ =
-      svnew->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_);
-  svnew->imm->AddIterators(read_options_, seqno_to_time_mapping, &imm_iters_,
-                           &arena_);
+      svnew->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_,
+                              svnew->mutable_cf_options.prefix_extractor.get());
+  svnew->imm->AddIterators(read_options_, seqno_to_time_mapping,
+                           svnew->mutable_cf_options.prefix_extractor.get(),
+                           &imm_iters_, &arena_);
  ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
                                       kMaxSequenceNumber /* upper_bound */);
  if (!read_options_.ignore_range_deletions) {
diff --git a/db/memtable.cc b/db/memtable.cc
index ef1184ded4..5ba0a0dacc 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -365,9 +365,12 @@ const char* EncodeKey(std::string* scratch, const Slice& target) {
 class MemTableIterator : public InternalIterator {
  public:
-  MemTableIterator(const MemTable& mem, const ReadOptions& read_options,
-                   UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
-                   Arena* arena, bool use_range_del_table = false)
+  enum Kind { kPointEntries, kRangeDelEntries };
+  MemTableIterator(
+      Kind kind, const MemTable& mem, const ReadOptions& read_options,
+      UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping = nullptr,
+      Arena* arena = nullptr,
+      const SliceTransform* cf_prefix_extractor = nullptr)
      : bloom_(nullptr),
        prefix_extractor_(mem.prefix_extractor_),
        comparator_(mem.comparator_),
@@ -382,14 +385,21 @@ class MemTableIterator : public InternalIterator {
        arena_mode_(arena != nullptr),
        paranoid_memory_checks_(mem.moptions_.paranoid_memory_checks),
        allow_data_in_error(mem.moptions_.allow_data_in_errors) {
-    if (use_range_del_table) {
+    if (kind == kRangeDelEntries) {
      iter_ = mem.range_del_table_->GetIterator(arena);
-    } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek &&
-               !read_options.auto_prefix_mode) {
+    } else if (prefix_extractor_ != nullptr &&
+               // NOTE: checking extractor equivalence when not pointer
+               // equivalent is arguably too expensive for memtable
+               prefix_extractor_ == cf_prefix_extractor &&
+               (read_options.prefix_same_as_start ||
+                (!read_options.total_order_seek &&
+                 !read_options.auto_prefix_mode))) {
      // Auto prefix mode is not implemented in memtable yet.
+      assert(kind == kPointEntries);
      bloom_ = mem.bloom_filter_.get();
      iter_ = mem.table_->GetDynamicPrefixIterator(arena);
    } else {
+      assert(kind == kPointEntries);
      iter_ = mem.table_->GetIterator(arena);
    }
    status_.PermitUncheckedError();
@@ -433,8 +443,8 @@ class MemTableIterator : public InternalIterator {
      // iterator should only use prefix bloom filter
      Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_));
      if (prefix_extractor_->InDomain(user_k_without_ts)) {
-        if (!bloom_->MayContain(
-                prefix_extractor_->Transform(user_k_without_ts))) {
+        Slice prefix = prefix_extractor_->Transform(user_k_without_ts);
+        if (!bloom_->MayContain(prefix)) {
          PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
          valid_ = false;
          return;
@@ -594,11 +604,13 @@ class MemTableIterator : public InternalIterator {

 InternalIterator* MemTable::NewIterator(
    const ReadOptions& read_options,
-    UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena) {
+    UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena,
+    const SliceTransform* prefix_extractor) {
  assert(arena != nullptr);
  auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
  return new (mem)
-      MemTableIterator(*this, read_options, seqno_to_time_mapping, arena);
+      MemTableIterator(MemTableIterator::kPointEntries, *this, read_options,
+                       seqno_to_time_mapping, arena, prefix_extractor);
 }

 FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator(
@@ -633,8 +645,7 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal(
    cache->reader_mutex.lock();
    if (!cache->tombstones) {
      auto* unfragmented_iter = new MemTableIterator(
-          *this, read_options, nullptr /* seqno_to_time_mapping= */,
-          nullptr /* arena */, true /* use_range_del_table */);
+          MemTableIterator::kRangeDelEntries, *this, read_options);
      cache->tombstones.reset(new FragmentedRangeTombstoneList(
          std::unique_ptr<InternalIterator>(unfragmented_iter),
          comparator_.comparator));
@@ -655,8 +666,7 @@ void MemTable::ConstructFragmentedRangeTombstones() {
  if (!is_range_del_table_empty_.load(std::memory_order_relaxed)) {
    // TODO: plumb Env::IOActivity, Env::IOPriority
    auto* unfragmented_iter = new MemTableIterator(
-        *this, ReadOptions(), nullptr /*seqno_to_time_mapping=*/,
-        nullptr /* arena */, true /* use_range_del_table */);
+        MemTableIterator::kRangeDelEntries, *this, ReadOptions());
    fragmented_range_tombstone_list_ =
        std::make_unique<FragmentedRangeTombstoneList>(
diff --git a/db/memtable.h b/db/memtable.h
index ca0652bc04..194b4543c2 100644
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -210,7 +210,8 @@ class MemTable {
  // data, currently only needed for iterators serving user reads.
  InternalIterator* NewIterator(
      const ReadOptions& read_options,
-      UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena);
+      UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena,
+      const SliceTransform* prefix_extractor);

  // Returns an iterator that yields the range tombstones of the memtable.
  // The caller must ensure that the underlying MemTable remains live
diff --git a/db/memtable_list.cc b/db/memtable_list.cc
index c3612656e2..c81c096b51 100644
--- a/db/memtable_list.cc
+++ b/db/memtable_list.cc
@@ -214,20 +214,23 @@ Status MemTableListVersion::AddRangeTombstoneIterators(
 void MemTableListVersion::AddIterators(
    const ReadOptions& options,
    UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
+    const SliceTransform* prefix_extractor,
    std::vector<InternalIterator*>* iterator_list, Arena* arena) {
  for (auto& m : memlist_) {
-    iterator_list->push_back(
-        m->NewIterator(options, seqno_to_time_mapping, arena));
+    iterator_list->push_back(m->NewIterator(options, seqno_to_time_mapping,
+                                            arena, prefix_extractor));
  }
 }

 void MemTableListVersion::AddIterators(
    const ReadOptions& options,
    UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
+    const SliceTransform* prefix_extractor,
    MergeIteratorBuilder* merge_iter_builder, bool add_range_tombstone_iter) {
  for (auto& m : memlist_) {
-    auto mem_iter = m->NewIterator(options, seqno_to_time_mapping,
-                                   merge_iter_builder->GetArena());
+    auto mem_iter =
+        m->NewIterator(options, seqno_to_time_mapping,
+                       merge_iter_builder->GetArena(), prefix_extractor);
    if (!add_range_tombstone_iter || options.ignore_range_deletions) {
      merge_iter_builder->AddIterator(mem_iter);
    } else {
diff --git a/db/memtable_list.h b/db/memtable_list.h
index dd439de559..390b4137dd 100644
--- a/db/memtable_list.h
+++ b/db/memtable_list.h
@@ -113,11 +113,13 @@ class MemTableListVersion {
  void AddIterators(const ReadOptions& options,
                    UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
+                    const SliceTransform* prefix_extractor,
                    std::vector<InternalIterator*>* iterator_list,
                    Arena* arena);

  void AddIterators(const ReadOptions& options,
                    UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
+                    const SliceTransform* prefix_extractor,
                    MergeIteratorBuilder* merge_iter_builder,
                    bool add_range_tombstone_iter);
diff --git a/db/repair.cc b/db/repair.cc
index 114d36a6a8..f7f4fbafb7 100644
--- a/db/repair.cc
+++ b/db/repair.cc
@@ -443,7 +443,8 @@ class Repairer {
    ro.total_order_seek = true;
    Arena arena;
    ScopedArenaPtr<InternalIterator> iter(
-        mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
+        mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena,
+                         /*prefix_extractor=*/nullptr));
    int64_t _current_time = 0;
    immutable_db_options_.clock->GetCurrentTime(&_current_time)
        .PermitUncheckedError();  // ignore error
diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc
index 8db8c32a0a..d09dd0167d 100644
--- a/db/write_batch_test.cc
+++ b/db/write_batch_test.cc
@@ -59,7 +59,7 @@ static std::string PrintContents(WriteBatch* b,
    InternalIterator* iter;
    if (i == 0) {
      iter = mem->NewIterator(ReadOptions(), /*seqno_to_time_mapping=*/nullptr,
-                              &arena);
+                              &arena, /*prefix_extractor=*/nullptr);
      arena_iter_guard.reset(iter);
    } else {
      iter = mem->NewRangeTombstoneIterator(ReadOptions(),
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 5d4839df87..5a4c310c41 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -1592,7 +1592,7 @@ Status StressTest::TestIterateImpl(ThreadState* thread,
    ro.total_order_seek = true;
    expect_total_order = true;
  } else if (thread->rand.OneIn(4)) {
-    ro.total_order_seek = false;
+    ro.total_order_seek = thread->rand.OneIn(2);
    ro.auto_prefix_mode = true;
    expect_total_order = true;
  } else if (options_.prefix_extractor.get() == nullptr) {
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 27feadb804..e272e3a69a 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1399,6 +1399,17 @@ struct DBOptions {
  // eventually be obsolete and removed as Identity files are phased out.
  bool write_identity_file = true;

+  // Historically, when prefix_extractor != nullptr, iterators have had
+  // unfortunate default semantics of *possibly* only returning data
+  // within the same prefix. To avoid "spooky action at a distance," iterator
+  // bounds should come from the instantiation or seeking of the iterator,
+  // not from a mutable column family option.
+  //
+  // When set to true, it is as if every iterator is created with
+  // total_order_seek=true and only auto_prefix_mode=true and
+  // prefix_same_as_start=true can take advantage of prefix seek optimizations.
+  bool prefix_seek_opt_in_only = false;
+
  // The number of bytes to prefetch when reading the log. This is mostly useful
  // for reading a remotely located log, as it can save the number of
  // round-trips. If 0, then the prefetching is disabled.
@@ -1848,10 +1859,10 @@ struct ReadOptions {
  bool auto_prefix_mode = false;

  // Enforce that the iterator only iterates over the same prefix as the seek.
-  // This option is effective only for prefix seeks, i.e. prefix_extractor is
-  // non-null for the column family and total_order_seek is false. Unlike
-  // iterate_upper_bound, prefix_same_as_start only works within a prefix
-  // but in both directions.
+  // This makes the iterator bounds dependent on the column family's current
+  // prefix_extractor, which is mutable. When SST files have been built with
+  // the same prefix extractor, prefix filtering optimizations will be used
+  // for both Seek and SeekForPrev.
  bool prefix_same_as_start = false;

  // Keep the blocks loaded by the iterator pinned in memory as long as the
diff --git a/java/rocksjni/write_batch_test.cc b/java/rocksjni/write_batch_test.cc
index 53f10998ca..ed456acc3e 100644
--- a/java/rocksjni/write_batch_test.cc
+++ b/java/rocksjni/write_batch_test.cc
@@ -60,7 +60,8 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(JNIEnv* env,
  ROCKSDB_NAMESPACE::Arena arena;
  ROCKSDB_NAMESPACE::ScopedArenaPtr<ROCKSDB_NAMESPACE::InternalIterator> iter(
      mem->NewIterator(ROCKSDB_NAMESPACE::ReadOptions(),
-                       /*seqno_to_time_mapping=*/nullptr, &arena));
+                       /*seqno_to_time_mapping=*/nullptr, &arena,
+                       /*prefix_extractor=*/nullptr));
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    ROCKSDB_NAMESPACE::ParsedInternalKey ikey;
    ikey.clear();
diff --git a/options/db_options.cc b/options/db_options.cc
index 0b880f4b93..e9e8fc5b7e 100644
--- a/options/db_options.cc
+++ b/options/db_options.cc
@@ -403,6 +403,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
         {offsetof(struct ImmutableDBOptions, avoid_unnecessary_blocking_io),
          OptionType::kBoolean, OptionVerificationType::kNormal,
          OptionTypeFlags::kNone}},
+        {"prefix_seek_opt_in_only",
+         {offsetof(struct ImmutableDBOptions, prefix_seek_opt_in_only),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
        {"write_dbid_to_manifest",
         {offsetof(struct ImmutableDBOptions, write_dbid_to_manifest),
          OptionType::kBoolean, OptionVerificationType::kNormal,
@@ -774,6 +778,7 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
      background_close_inactive_wals(options.background_close_inactive_wals),
      atomic_flush(options.atomic_flush),
      avoid_unnecessary_blocking_io(options.avoid_unnecessary_blocking_io),
+      prefix_seek_opt_in_only(options.prefix_seek_opt_in_only),
      persist_stats_to_disk(options.persist_stats_to_disk),
      write_dbid_to_manifest(options.write_dbid_to_manifest),
      write_identity_file(options.write_identity_file),
@@ -948,6 +953,8 @@ void ImmutableDBOptions::Dump(Logger* log) const {
  ROCKS_LOG_HEADER(log, " Options.avoid_unnecessary_blocking_io: %d",
                   avoid_unnecessary_blocking_io);
+  ROCKS_LOG_HEADER(log, " Options.prefix_seek_opt_in_only: %d",
+                   prefix_seek_opt_in_only);
  ROCKS_LOG_HEADER(log, " Options.persist_stats_to_disk: %u",
                   persist_stats_to_disk);
  ROCKS_LOG_HEADER(log, " Options.write_dbid_to_manifest: %d",
diff --git a/options/db_options.h b/options/db_options.h
index 842fefca86..ac76ea40d8 100644
--- a/options/db_options.h
+++ b/options/db_options.h
@@ -87,6 +87,7 @@ struct ImmutableDBOptions {
  bool background_close_inactive_wals;
  bool atomic_flush;
  bool avoid_unnecessary_blocking_io;
+  bool prefix_seek_opt_in_only;
  bool persist_stats_to_disk;
  bool write_dbid_to_manifest;
  bool write_identity_file;
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index fe45224d08..0dfe3e38ab 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -2063,7 +2063,9 @@ InternalIterator* BlockBasedTable::NewIterator(
  if (arena == nullptr) {
    return new BlockBasedTableIterator(
        this, read_options, rep_->internal_comparator, std::move(index_iter),
-        !skip_filters && !read_options.total_order_seek &&
+        !skip_filters &&
+            (!read_options.total_order_seek || read_options.auto_prefix_mode ||
+             read_options.prefix_same_as_start) &&
            prefix_extractor != nullptr,
        need_upper_bound_check, prefix_extractor, caller,
        compaction_readahead_size, allow_unprepared_value);
@@ -2071,7 +2073,9 @@ InternalIterator* BlockBasedTable::NewIterator(
    auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator));
    return new (mem) BlockBasedTableIterator(
        this, read_options, rep_->internal_comparator, std::move(index_iter),
-        !skip_filters && !read_options.total_order_seek &&
+        !skip_filters &&
+            (!read_options.total_order_seek || read_options.auto_prefix_mode ||
+             read_options.prefix_same_as_start) &&
            prefix_extractor != nullptr,
        need_upper_bound_check, prefix_extractor, caller,
        compaction_readahead_size, allow_unprepared_value);
diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc
index d3c968f73a..9d4e8eccf3 100644
--- a/table/plain/plain_table_reader.cc
+++ b/table/plain/plain_table_reader.cc
@@ -201,8 +201,10 @@ InternalIterator* PlainTableReader::NewIterator(
  assert(table_properties_);
  // Auto prefix mode is not implemented in PlainTable.
-  bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek &&
-                         !options.auto_prefix_mode;
+  bool use_prefix_seek =
+      !IsTotalOrderMode() &&
+      (options.prefix_same_as_start ||
+       (!options.total_order_seek && !options.auto_prefix_mode));
  if (arena == nullptr) {
    return new PlainTableIterator(this, use_prefix_seek);
  } else {
diff --git a/table/table_test.cc b/table/table_test.cc
index 9eee267614..bb0b70222c 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -534,7 +534,7 @@ class MemTableConstructor : public Constructor {
      const SliceTransform* /*prefix_extractor*/) const override {
    return new KeyConvertingIterator(
        memtable_->NewIterator(ReadOptions(), /*seqno_to_time_mapping=*/nullptr,
-                               &arena_),
+                               &arena_, /*prefix_extractor=*/nullptr),
        true);
  }
@@ -4904,8 +4904,9 @@ TEST_F(MemTableTest, Simple) {
    std::unique_ptr<InternalIterator> iter_guard;
    InternalIterator* iter;
    if (i == 0) {
-      iter = GetMemTable()->NewIterator(
-          ReadOptions(), /*seqno_to_time_mapping=*/nullptr, &arena);
+      iter = GetMemTable()->NewIterator(ReadOptions(),
+                                        /*seqno_to_time_mapping=*/nullptr,
+                                        &arena, /*prefix_extractor=*/nullptr);
      arena_iter_guard.reset(iter);
    } else {
      iter = GetMemTable()->NewRangeTombstoneIterator(
diff --git a/unreleased_history/bug_fixes/memtable_prefix.md b/unreleased_history/bug_fixes/memtable_prefix.md
new file mode 100644
index 0000000000..d7b45c65ee
--- /dev/null
+++ b/unreleased_history/bug_fixes/memtable_prefix.md
@@ -0,0 +1 @@
+* Fix handling of dynamic change of `prefix_extractor` with memtable prefix filter. Previously, prefix seek could mix different prefix interpretations between memtable and SST files. Now the latest `prefix_extractor` at the time of iterator creation or refresh is respected.
diff --git a/unreleased_history/new_features/prefix_seek_opt_in_only.md b/unreleased_history/new_features/prefix_seek_opt_in_only.md
new file mode 100644
index 0000000000..71c3cd6893
--- /dev/null
+++ b/unreleased_history/new_features/prefix_seek_opt_in_only.md
@@ -0,0 +1 @@
+* Add new option `prefix_seek_opt_in_only` that makes iterators generally safer when you might set a `prefix_extractor`. When `prefix_seek_opt_in_only=true`, which is expected to be the future default, prefix seek is only used when `prefix_same_as_start` or `auto_prefix_mode` is set. Also, `prefix_same_as_start` and `auto_prefix_mode` now allow prefix filtering even with `total_order_seek=true`.
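For reviewers, a minimal usage sketch of the opt-in behavior described in the release note above. This is illustrative only and not part of the diff; the DB path and key layout are hypothetical, and it only exercises the public options touched by this change (`prefix_seek_opt_in_only`, `prefix_same_as_start`, `NewFixedPrefixTransform`):

```cpp
#include <cassert>
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  Options options;
  options.create_if_missing = true;
  // Opt in: iterators behave as if total_order_seek=true unless the read
  // explicitly requests prefix semantics via prefix_same_as_start or
  // auto_prefix_mode.
  options.prefix_seek_opt_in_only = true;
  options.prefix_extractor.reset(NewFixedPrefixTransform(4));

  DB* db = nullptr;
  Status s = DB::Open(options, "/tmp/prefix_opt_in_example", &db);
  assert(s.ok());

  // Plain iterator: total-order semantics, no implicit prefix bounds from
  // the (mutable) column family prefix_extractor.
  {
    std::unique_ptr<Iterator> it(db->NewIterator(ReadOptions()));
    for (it->SeekToFirst(); it->Valid(); it->Next()) {
      // visits every key
    }
    assert(it->status().ok());
  }

  // Explicitly prefix-bounded iterator: prefix Bloom/filtering optimizations
  // in memtable and SST files still apply.
  {
    ReadOptions ro;
    ro.prefix_same_as_start = true;
    std::unique_ptr<Iterator> it(db->NewIterator(ro));
    for (it->Seek("goat"); it->Valid(); it->Next()) {
      // only keys whose 4-byte prefix is "goat"
    }
    assert(it->status().ok());
  }

  delete db;
  return 0;
}
```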