mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-26 16:30:56 +00:00
Prefix-based iterating only shows keys in prefix
Summary: MyRocks testing found an issue that while iterating over keys that are outside the prefix, sometimes wrong results were seen for keys outside the prefix. We now tighten the range of keys seen with a new read option called prefix_seen_at_start. This remembers the starting prefix and then compares it on a Next for equality of prefix. If they are from a different prefix, it sets valid to false. Test Plan: PrefixTest.PrefixValid Reviewers: IslamAbdelRahman, sdong, yhchiang, anthony Reviewed By: anthony Subscribers: spetrunia, hermanlee4, yoshinorim, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D50211
This commit is contained in:
parent
14c6e1a04c
commit
9d50afc3b9
|
@ -3573,10 +3573,11 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
|
|||
#else
|
||||
SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
|
||||
auto iter = new ForwardIterator(this, read_options, cfd, sv);
|
||||
return NewDBIterator(env_, *cfd->ioptions(), cfd->user_comparator(), iter,
|
||||
return NewDBIterator(
|
||||
env_, *cfd->ioptions(), cfd->user_comparator(), iter,
|
||||
kMaxSequenceNumber,
|
||||
sv->mutable_cf_options.max_sequential_skip_in_iterations,
|
||||
read_options.iterate_upper_bound);
|
||||
read_options.iterate_upper_bound, read_options.prefix_same_as_start);
|
||||
#endif
|
||||
} else {
|
||||
SequenceNumber latest_snapshot = versions_->LastSequence();
|
||||
|
@ -3631,9 +3632,9 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
|
|||
// likely that any iterator pointer is close to the iterator it points to so
|
||||
// that they are likely to be in the same cache line and/or page.
|
||||
ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator(
|
||||
env_, *cfd->ioptions(), cfd->user_comparator(),
|
||||
snapshot, sv->mutable_cf_options.max_sequential_skip_in_iterations,
|
||||
read_options.iterate_upper_bound);
|
||||
env_, *cfd->ioptions(), cfd->user_comparator(), snapshot,
|
||||
sv->mutable_cf_options.max_sequential_skip_in_iterations,
|
||||
read_options.iterate_upper_bound, read_options.prefix_same_as_start);
|
||||
|
||||
InternalIterator* internal_iter =
|
||||
NewInternalIterator(read_options, cfd, sv, db_iter->GetArena());
|
||||
|
|
|
@ -62,7 +62,8 @@ class DBIter: public Iterator {
|
|||
DBIter(Env* env, const ImmutableCFOptions& ioptions, const Comparator* cmp,
|
||||
InternalIterator* iter, SequenceNumber s, bool arena_mode,
|
||||
uint64_t max_sequential_skip_in_iterations,
|
||||
const Slice* iterate_upper_bound = nullptr)
|
||||
const Slice* iterate_upper_bound = nullptr,
|
||||
bool prefix_same_as_start = false)
|
||||
: arena_mode_(arena_mode),
|
||||
env_(env),
|
||||
logger_(ioptions.info_log),
|
||||
|
@ -74,7 +75,8 @@ class DBIter: public Iterator {
|
|||
valid_(false),
|
||||
current_entry_is_merged_(false),
|
||||
statistics_(ioptions.statistics),
|
||||
iterate_upper_bound_(iterate_upper_bound) {
|
||||
iterate_upper_bound_(iterate_upper_bound),
|
||||
prefix_same_as_start_(prefix_same_as_start) {
|
||||
RecordTick(statistics_, NO_ITERATORS);
|
||||
prefix_extractor_ = ioptions.prefix_extractor;
|
||||
max_skip_ = max_sequential_skip_in_iterations;
|
||||
|
@ -155,6 +157,8 @@ class DBIter: public Iterator {
|
|||
Statistics* statistics_;
|
||||
uint64_t max_skip_;
|
||||
const Slice* iterate_upper_bound_;
|
||||
Slice prefix_start_;
|
||||
bool prefix_same_as_start_;
|
||||
|
||||
// No copying allowed
|
||||
DBIter(const DBIter&);
|
||||
|
@ -197,6 +201,11 @@ void DBIter::Next() {
|
|||
RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
|
||||
}
|
||||
}
|
||||
if (valid_ && prefix_extractor_ && prefix_same_as_start_ &&
|
||||
prefix_extractor_->Transform(saved_key_.GetKey())
|
||||
.compare(prefix_start_) != 0) {
|
||||
valid_ = false;
|
||||
}
|
||||
}
|
||||
|
||||
// PRE: saved_key_ has the current user key if skipping
|
||||
|
@ -367,6 +376,11 @@ void DBIter::Prev() {
|
|||
RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
|
||||
}
|
||||
}
|
||||
if (valid_ && prefix_extractor_ && prefix_same_as_start_ &&
|
||||
prefix_extractor_->Transform(saved_key_.GetKey())
|
||||
.compare(prefix_start_) != 0) {
|
||||
valid_ = false;
|
||||
}
|
||||
}
|
||||
|
||||
void DBIter::ReverseToBackward() {
|
||||
|
@ -668,6 +682,9 @@ void DBIter::Seek(const Slice& target) {
|
|||
} else {
|
||||
valid_ = false;
|
||||
}
|
||||
if (valid_ && prefix_extractor_ && prefix_same_as_start_) {
|
||||
prefix_start_ = prefix_extractor_->Transform(target);
|
||||
}
|
||||
}
|
||||
|
||||
void DBIter::SeekToFirst() {
|
||||
|
@ -696,6 +713,9 @@ void DBIter::SeekToFirst() {
|
|||
} else {
|
||||
valid_ = false;
|
||||
}
|
||||
if (valid_ && prefix_extractor_ && prefix_same_as_start_) {
|
||||
prefix_start_ = prefix_extractor_->Transform(saved_key_.GetKey());
|
||||
}
|
||||
}
|
||||
|
||||
void DBIter::SeekToLast() {
|
||||
|
@ -741,6 +761,9 @@ void DBIter::SeekToLast() {
|
|||
RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
|
||||
}
|
||||
}
|
||||
if (valid_ && prefix_extractor_ && prefix_same_as_start_) {
|
||||
prefix_start_ = prefix_extractor_->Transform(saved_key_.GetKey());
|
||||
}
|
||||
}
|
||||
|
||||
Iterator* NewDBIterator(Env* env, const ImmutableCFOptions& ioptions,
|
||||
|
@ -748,10 +771,11 @@ Iterator* NewDBIterator(Env* env, const ImmutableCFOptions& ioptions,
|
|||
InternalIterator* internal_iter,
|
||||
const SequenceNumber& sequence,
|
||||
uint64_t max_sequential_skip_in_iterations,
|
||||
const Slice* iterate_upper_bound) {
|
||||
const Slice* iterate_upper_bound,
|
||||
bool prefix_same_as_start) {
|
||||
return new DBIter(env, ioptions, user_key_comparator, internal_iter, sequence,
|
||||
false, max_sequential_skip_in_iterations,
|
||||
iterate_upper_bound);
|
||||
iterate_upper_bound, prefix_same_as_start);
|
||||
}
|
||||
|
||||
ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); }
|
||||
|
@ -780,16 +804,16 @@ void ArenaWrappedDBIter::RegisterCleanup(CleanupFunction function, void* arg1,
|
|||
|
||||
ArenaWrappedDBIter* NewArenaWrappedDbIterator(
|
||||
Env* env, const ImmutableCFOptions& ioptions,
|
||||
const Comparator* user_key_comparator,
|
||||
const SequenceNumber& sequence,
|
||||
const Comparator* user_key_comparator, const SequenceNumber& sequence,
|
||||
uint64_t max_sequential_skip_in_iterations,
|
||||
const Slice* iterate_upper_bound) {
|
||||
const Slice* iterate_upper_bound, bool prefix_same_as_start) {
|
||||
ArenaWrappedDBIter* iter = new ArenaWrappedDBIter();
|
||||
Arena* arena = iter->GetArena();
|
||||
auto mem = arena->AllocateAligned(sizeof(DBIter));
|
||||
DBIter* db_iter = new (mem) DBIter(env, ioptions, user_key_comparator,
|
||||
nullptr, sequence, true, max_sequential_skip_in_iterations,
|
||||
iterate_upper_bound);
|
||||
DBIter* db_iter =
|
||||
new (mem) DBIter(env, ioptions, user_key_comparator, nullptr, sequence,
|
||||
true, max_sequential_skip_in_iterations,
|
||||
iterate_upper_bound, prefix_same_as_start);
|
||||
|
||||
iter->SetDBIter(db_iter);
|
||||
|
||||
|
|
10
db/db_iter.h
10
db/db_iter.h
|
@ -29,7 +29,8 @@ extern Iterator* NewDBIterator(Env* env, const ImmutableCFOptions& options,
|
|||
InternalIterator* internal_iter,
|
||||
const SequenceNumber& sequence,
|
||||
uint64_t max_sequential_skip_in_iterations,
|
||||
const Slice* iterate_upper_bound = nullptr);
|
||||
const Slice* iterate_upper_bound = nullptr,
|
||||
bool prefix_same_as_start = false);
|
||||
|
||||
// A wrapper iterator which wraps DB Iterator and the arena, with which the DB
|
||||
// iterator is supposed be allocated. This class is used as an entry point of
|
||||
|
@ -71,8 +72,9 @@ class ArenaWrappedDBIter : public Iterator {
|
|||
// Generate the arena wrapped iterator class.
|
||||
extern ArenaWrappedDBIter* NewArenaWrappedDbIterator(
|
||||
Env* env, const ImmutableCFOptions& options,
|
||||
const Comparator* user_key_comparator,
|
||||
const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
|
||||
const Slice* iterate_upper_bound = nullptr);
|
||||
const Comparator* user_key_comparator, const SequenceNumber& sequence,
|
||||
uint64_t max_sequential_skip_in_iterations,
|
||||
const Slice* iterate_upper_bound = nullptr,
|
||||
bool prefix_same_as_start = false);
|
||||
|
||||
} // namespace rocksdb
|
||||
|
|
|
@ -20,9 +20,11 @@ int main() {
|
|||
#include <gflags/gflags.h>
|
||||
#include "rocksdb/comparator.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/filter_policy.h"
|
||||
#include "rocksdb/perf_context.h"
|
||||
#include "rocksdb/slice_transform.h"
|
||||
#include "rocksdb/memtablerep.h"
|
||||
#include "rocksdb/table.h"
|
||||
#include "util/histogram.h"
|
||||
#include "util/stop_watch.h"
|
||||
#include "util/string_util.h"
|
||||
|
@ -163,6 +165,12 @@ class PrefixTest : public testing::Test {
|
|||
options.memtable_prefix_bloom_huge_page_tlb_size =
|
||||
FLAGS_memtable_prefix_bloom_huge_page_tlb_size;
|
||||
|
||||
options.prefix_extractor.reset(NewFixedPrefixTransform(8));
|
||||
BlockBasedTableOptions bbto;
|
||||
bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
|
||||
bbto.whole_key_filtering = false;
|
||||
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
||||
|
||||
Status s = DB::Open(options, kDbName, &db);
|
||||
EXPECT_OK(s);
|
||||
return std::shared_ptr<DB>(db);
|
||||
|
@ -393,6 +401,55 @@ TEST_F(PrefixTest, TestResult) {
|
|||
}
|
||||
}
|
||||
|
||||
// Show results in prefix
|
||||
TEST_F(PrefixTest, PrefixValid) {
|
||||
for (int num_buckets = 1; num_buckets <= 2; num_buckets++) {
|
||||
FirstOption();
|
||||
while (NextOptions(num_buckets)) {
|
||||
std::cout << "*** Mem table: " << options.memtable_factory->Name()
|
||||
<< " number of buckets: " << num_buckets << std::endl;
|
||||
DestroyDB(kDbName, Options());
|
||||
auto db = OpenDb();
|
||||
WriteOptions write_options;
|
||||
ReadOptions read_options;
|
||||
|
||||
// Insert keys with common prefix and one key with different
|
||||
Slice v16("v16");
|
||||
Slice v17("v17");
|
||||
Slice v18("v18");
|
||||
Slice v19("v19");
|
||||
PutKey(db.get(), write_options, 12345, 6, v16);
|
||||
PutKey(db.get(), write_options, 12345, 7, v17);
|
||||
PutKey(db.get(), write_options, 12345, 8, v18);
|
||||
PutKey(db.get(), write_options, 12345, 9, v19);
|
||||
PutKey(db.get(), write_options, 12346, 8, v16);
|
||||
db->Flush(FlushOptions());
|
||||
db->Delete(write_options, TestKeyToSlice(TestKey(12346, 8)));
|
||||
db->Flush(FlushOptions());
|
||||
read_options.prefix_same_as_start = true;
|
||||
std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
|
||||
SeekIterator(iter.get(), 12345, 6);
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v16 == iter->value());
|
||||
|
||||
iter->Next();
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v17 == iter->value());
|
||||
|
||||
iter->Next();
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v18 == iter->value());
|
||||
|
||||
iter->Next();
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_TRUE(v19 == iter->value());
|
||||
iter->Next();
|
||||
ASSERT_FALSE(iter->Valid());
|
||||
ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 12346, 8));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(PrefixTest, DynamicPrefixIterator) {
|
||||
while (NextOptions(FLAGS_bucket_count)) {
|
||||
std::cout << "*** Mem table: " << options.memtable_factory->Name()
|
||||
|
|
|
@ -1290,6 +1290,14 @@ struct ReadOptions {
|
|||
// this option.
|
||||
bool total_order_seek;
|
||||
|
||||
// Enforce that the iterator only iterates over the same prefix as the seek.
|
||||
// This option is effective only for prefix seeks, i.e. prefix_extractor is
|
||||
// non-null for the column family and total_order_seek is false. Unlike
|
||||
// iterate_upper_bound, prefix_same_as_start only works within a prefix
|
||||
// but in both directions.
|
||||
// Default: false
|
||||
bool prefix_same_as_start;
|
||||
|
||||
ReadOptions();
|
||||
ReadOptions(bool cksum, bool cache);
|
||||
};
|
||||
|
|
|
@ -717,7 +717,8 @@ ReadOptions::ReadOptions()
|
|||
read_tier(kReadAllTier),
|
||||
tailing(false),
|
||||
managed(false),
|
||||
total_order_seek(false) {
|
||||
total_order_seek(false),
|
||||
prefix_same_as_start(false) {
|
||||
XFUNC_TEST("", "managed_options", managed_options, xf_manage_options,
|
||||
reinterpret_cast<ReadOptions*>(this));
|
||||
}
|
||||
|
@ -730,7 +731,8 @@ ReadOptions::ReadOptions(bool cksum, bool cache)
|
|||
read_tier(kReadAllTier),
|
||||
tailing(false),
|
||||
managed(false),
|
||||
total_order_seek(false) {
|
||||
total_order_seek(false),
|
||||
prefix_same_as_start(false) {
|
||||
XFUNC_TEST("", "managed_options", managed_options, xf_manage_options,
|
||||
reinterpret_cast<ReadOptions*>(this));
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue