Fix interaction between CompactionFilter::Decision::kRemoveAndSkipUnt…

Summary:
Fixes the following scenario:
 1. Set a prefix extractor. Enable bloom filters with `whole_key_filtering = false`. Use a compaction filter that sometimes returns `kRemoveAndSkipUntil` (a minimal sketch of such a filter follows this list).
 2. Do a compaction.
 3. Compaction creates an iterator with `total_order_seek = false`, calls `SeekToFirst()` on it, then repeatedly calls `Next()`.
 4. At some point compaction filter returns `kRemoveAndSkipUntil`.
 5. Compaction calls `Seek(skip_until)` on the iterator. The key it seeks to happens to have a prefix that doesn't match the bloom filter. Since `total_order_seek = false`, the iterator becomes invalid, and compaction thinks it has reached the end. The rest of the compaction input is silently discarded.
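
For concreteness, here is a minimal sketch of a compaction filter like the one in step 1. The class name and the "skip when the value starts with tmp_" rule are made up for illustration; only the `FilterV2()` / `kRemoveAndSkipUntil` mechanics mirror the real API:

  #include <string>
  #include "rocksdb/compaction_filter.h"

  // Hypothetical filter: removes keys whose value starts with "tmp_" and tells
  // the compaction to skip ahead past the current key.
  class SkipAheadFilter : public rocksdb::CompactionFilter {
   public:
    Decision FilterV2(int /*level*/, const rocksdb::Slice& key,
                      ValueType /*value_type*/, const rocksdb::Slice& value,
                      std::string* /*new_value*/,
                      std::string* skip_until) const override {
      if (value.starts_with("tmp_")) {
        // Everything in [key, *skip_until) is dropped; *skip_until must
        // compare greater than key for the skip to take effect.
        *skip_until = key.ToString() + "\xff";
        return Decision::kRemoveAndSkipUntil;
      }
      return Decision::kKeep;
    }

    const char* Name() const override { return "SkipAheadFilter"; }
  };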

The fix is to make the compaction iterator use `total_order_seek = true`.
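
Roughly, the user-visible analogue of what was going wrong and what the fix relies on (illustrative sketch only; the keys mirror the new test below, and it assumes a DB opened with a prefix extractor and prefix bloom filters as in step 1):

  #include <memory>
  #include "rocksdb/db.h"

  void IllustrateTotalOrderSeek(rocksdb::DB* db) {
    // Prefix mode (what compaction used before): Seek() may consult the prefix
    // bloom filter and come back !Valid() if the target's prefix has no entries,
    // even though larger keys exist in the table.
    rocksdb::ReadOptions prefix_ro;  // total_order_seek is false by default
    std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(prefix_ro));
    it->Seek("0000000030");

    // Total-order mode (what compaction input iterators use after this fix):
    // Seek() lands on the next key ("0000000050") regardless of the filter.
    rocksdb::ReadOptions total_ro;
    total_ro.total_order_seek = true;
    std::unique_ptr<rocksdb::Iterator> it2(db->NewIterator(total_ro));
    it2->Seek("0000000030");
  }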

The implementation for PlainTable is quite awkward. I've made `kRemoveAndSkipUntil` officially incompatible with PlainTable: if you try to use them together, compaction will fail, and the DB will enter read-only mode (`bg_error_`). That's not a very graceful way to communicate a misconfiguration, but the alternatives don't seem worth the implementation time and complexity. To check in advance that `kRemoveAndSkipUntil` is not going to be used with PlainTable, we'd need to extend the interface of either `CompactionFilter` or `InternalIterator`. It seems unlikely that anyone will ever want to use `kRemoveAndSkipUntil` with PlainTable: PlainTable probably has very few users, and `kRemoveAndSkipUntil` has only one user so far: us (logdevice).
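
For illustration, roughly how the misconfiguration surfaces (hypothetical setup; `SkipAheadFilter` is the sketch above, and the DB path is arbitrary):

  #include "rocksdb/db.h"
  #include "rocksdb/slice_transform.h"
  #include "rocksdb/table.h"

  rocksdb::Status OpenMisconfiguredDb(rocksdb::DB** db) {
    rocksdb::Options options;
    options.create_if_missing = true;
    options.allow_mmap_reads = true;  // PlainTable requires mmap reads
    // PlainTable in prefix mode...
    options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(8));
    options.table_factory.reset(rocksdb::NewPlainTableFactory());
    // ...combined with a filter that may return kRemoveAndSkipUntil.
    options.compaction_filter = new SkipAheadFilter();  // leaked for brevity
    // The DB opens fine; the misconfiguration is only detected later. The first
    // compaction in which the filter actually returns kRemoveAndSkipUntil fails
    // with InvalidArgument (from the new Seek() check in PlainTableIterator
    // below), the error becomes bg_error_, and the DB goes read-only.
    return rocksdb::DB::Open(options, "/tmp/plain_table_db", db);
  }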
Closes https://github.com/facebook/rocksdb/pull/2349

Differential Revision: D5110388

Pulled By: lightmark

fbshipit-source-id: ec29101a99d9dcd97db33923b87f72bce56cc17a
Mike Kolupaev 2017-06-02 14:56:31 -07:00 committed by Facebook Github Bot
parent 95b0e89b5d
commit 138b87eae4
6 changed files with 74 additions and 28 deletions

db/db_compaction_filter_test.cc

@@ -780,7 +780,7 @@ TEST_F(DBTestCompactionFilter, SkipUntil) {
cfilter_skips = 0;
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
- // Numberof skips in tables: 2, 3, 3, 3.
+ // Number of skips in tables: 2, 3, 3, 3.
ASSERT_EQ(11, cfilter_skips);
for (int table = 0; table < 4; ++table) {
@@ -801,6 +801,43 @@ TEST_F(DBTestCompactionFilter, SkipUntil) {
}
}
+ TEST_F(DBTestCompactionFilter, SkipUntilWithBloomFilter) {
+ BlockBasedTableOptions table_options;
+ table_options.whole_key_filtering = false;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(100, false));
+ Options options = CurrentOptions();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewCappedPrefixTransform(9));
+ options.compaction_filter_factory = std::make_shared<SkipEvenFilterFactory>();
+ options.disable_auto_compactions = true;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ Put("0000000010", "v10");
+ Put("0000000020", "v20"); // skipped
+ Put("0000000050", "v50");
+ Flush();
+ cfilter_skips = 0;
+ EXPECT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ EXPECT_EQ(1, cfilter_skips);
+ Status s;
+ std::string val;
+ s = db_->Get(ReadOptions(), "0000000010", &val);
+ ASSERT_OK(s);
+ EXPECT_EQ("v10", val);
+ s = db_->Get(ReadOptions(), "0000000020", &val);
+ EXPECT_TRUE(s.IsNotFound());
+ s = db_->Get(ReadOptions(), "0000000050", &val);
+ ASSERT_OK(s);
+ EXPECT_EQ("v50", val);
+ }
} // namespace rocksdb
int main(int argc, char** argv) {

db/version_set.cc

@@ -3495,9 +3495,11 @@ InternalIterator* VersionSet::MakeInputIterator(
ReadOptions read_options;
read_options.verify_checksums = true;
read_options.fill_cache = false;
- if (c->ShouldFormSubcompactions()) {
- read_options.total_order_seek = true;
- }
+ // Compaction iterators shouldn't be confined to a single prefix.
+ // Compactions use Seek() for
+ // (a) concurrent compactions,
+ // (b) CompactionFilter::Decision::kRemoveAndSkipUntil.
+ read_options.total_order_seek = true;
// Level-0 files have to be merged together. For other levels,
// we will make a concatenating iterator per level.

include/rocksdb/compaction_filter.h

@@ -135,19 +135,19 @@ class CompactionFilter {
// *skip_until <= key is treated the same as Decision::kKeep
// (since the range [key, *skip_until) is empty).
//
- // The keys are skipped even if there are snapshots containing them,
- // as if IgnoreSnapshots() was true; i.e. values removed
- // by kRemoveAndSkipUntil can disappear from a snapshot - beware
- // if you're using TransactionDB or DB::GetSnapshot().
- //
- // Another warning: if value for a key was overwritten or merged into
- // (multiple Put()s or Merge()s), and compaction filter skips this key
- // with kRemoveAndSkipUntil, it's possible that it will remove only
- // the new value, exposing the old value that was supposed to be
- // overwritten.
- //
- // If you use kRemoveAndSkipUntil, consider also reducing
- // compaction_readahead_size option.
+ // Caveats:
+ // - The keys are skipped even if there are snapshots containing them,
+ //   as if IgnoreSnapshots() was true; i.e. values removed
+ //   by kRemoveAndSkipUntil can disappear from a snapshot - beware
+ //   if you're using TransactionDB or DB::GetSnapshot().
+ // - If value for a key was overwritten or merged into (multiple Put()s
+ //   or Merge()s), and compaction filter skips this key with
+ //   kRemoveAndSkipUntil, it's possible that it will remove only
+ //   the new value, exposing the old value that was supposed to be
+ //   overwritten.
+ // - Doesn't work with PlainTableFactory in prefix mode.
+ // - If you use kRemoveAndSkipUntil, consider also reducing
+ //   compaction_readahead_size option.
//
// Note: If you are using a TransactionDB, it is not recommended to filter
// out or modify merge operands (ValueType::kMergeOperand).

table/cuckoo_table_factory.h

@@ -47,6 +47,7 @@ static inline uint64_t CuckooHash(
// - Key length and Value length are fixed.
// - Does not support Snapshot.
// - Does not support Merge operations.
+ // - Does not support prefix bloom filters.
class CuckooTableFactory : public TableFactory {
public:
explicit CuckooTableFactory(const CuckooTableOptions& table_options)

table/cuckoo_table_reader.cc

@@ -372,10 +372,6 @@ InternalIterator* CuckooTableReader::NewIterator(
return NewErrorInternalIterator(
Status::Corruption("CuckooTableReader status is not okay."), arena);
}
- if (read_options.total_order_seek) {
- return NewErrorInternalIterator(
- Status::InvalidArgument("total_order_seek is not supported."), arena);
- }
CuckooTableIterator* iter;
if (arena == nullptr) {
iter = new CuckooTableIterator(this);

table/plain_table_reader.cc

@@ -193,15 +193,12 @@ InternalIterator* PlainTableReader::NewIterator(const ReadOptions& options,
Arena* arena,
const InternalKeyComparator*,
bool skip_filters) {
- if (options.total_order_seek && !IsTotalOrderMode()) {
- return NewErrorInternalIterator(
- Status::InvalidArgument("total_order_seek not supported"), arena);
- }
+ bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek;
if (arena == nullptr) {
- return new PlainTableIterator(this, prefix_extractor_ != nullptr);
+ return new PlainTableIterator(this, use_prefix_seek);
} else {
auto mem = arena->AllocateAligned(sizeof(PlainTableIterator));
- return new (mem) PlainTableIterator(this, prefix_extractor_ != nullptr);
+ return new (mem) PlainTableIterator(this, use_prefix_seek);
}
}
@@ -641,9 +638,22 @@ void PlainTableIterator::SeekToLast() {
}
void PlainTableIterator::Seek(const Slice& target) {
+ if (use_prefix_seek_ != !table_->IsTotalOrderMode()) {
+ // This check is done here instead of NewIterator() to permit creating an
+ // iterator with total_order_seek = true even if we won't be able to Seek()
+ // it. This is needed for compaction: it creates iterator with
+ // total_order_seek = true but usually never does Seek() on it,
+ // only SeekToFirst().
+ status_ =
+ Status::InvalidArgument(
+ "total_order_seek not implemented for PlainTable.");
+ offset_ = next_offset_ = table_->file_info_.data_end_offset;
+ return;
+ }
// If the user doesn't set prefix seek option and we are not able to do a
// total Seek(). assert failure.
if (!use_prefix_seek_) {
if (table_->IsTotalOrderMode()) {
if (table_->full_scan_mode_) {
status_ =
Status::InvalidArgument("Seek() is not allowed in full scan mode.");