mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-26 16:30:56 +00:00
014fd55adc
Summary: This patch fixes #7460559. It introduces SingleDelete as a new database operation. This operation can be used to delete keys that were never overwritten (no put following another put of the same key). If an overwritten key is single deleted the behavior is undefined. Single deletion of a non-existent key has no effect but multiple consecutive single deletions are not allowed (see limitations). In contrast to the conventional Delete() operation, the deletion entry is removed along with the value when the two are lined up in a compaction. Note: The semantics are similar to @igor's prototype that allowed to have this behavior on the granularity of a column family ( https://reviews.facebook.net/D42093 ). This new patch, however, is more aggressive when it comes to removing tombstones: It removes the SingleDelete together with the value whenever there is no snapshot between them while the older patch only did this when the sequence number of the deletion was older than the earliest snapshot. Most of the complex additions are in the Compaction Iterator, all other changes should be relatively straightforward. The patch also includes basic support for single deletions in db_stress and db_bench. Limitations: - Not compatible with cuckoo hash tables - Single deletions cannot be used in combination with merges and normal deletions on the same key (other keys are not affected by this) - Consecutive single deletions are currently not allowed (and older version of this patch supported this so it could be resurrected if needed) Test Plan: make all check Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor Reviewed By: igor Subscribers: maykov, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D43179
141 lines
5.2 KiB
C++
141 lines
5.2 KiB
C++
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under the BSD-style license found in the
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
#pragma once
|
|
|
|
#include <algorithm>
|
|
#include <deque>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#include "db/compaction.h"
|
|
#include "db/merge_helper.h"
|
|
#include "rocksdb/compaction_filter.h"
|
|
#include "util/log_buffer.h"
|
|
|
|
namespace rocksdb {
|
|
|
|
struct CompactionIteratorStats {
|
|
// Compaction statistics
|
|
int64_t num_record_drop_user = 0;
|
|
int64_t num_record_drop_hidden = 0;
|
|
int64_t num_record_drop_obsolete = 0;
|
|
uint64_t total_filter_time = 0;
|
|
|
|
// Input statistics
|
|
// TODO(noetzli): The stats are incomplete. They are lacking everything
|
|
// consumed by MergeHelper.
|
|
uint64_t num_input_records = 0;
|
|
uint64_t num_input_deletion_records = 0;
|
|
uint64_t num_input_corrupt_records = 0;
|
|
uint64_t total_input_raw_key_bytes = 0;
|
|
uint64_t total_input_raw_value_bytes = 0;
|
|
};
|
|
|
|
class CompactionIterator {
|
|
public:
|
|
CompactionIterator(Iterator* input, const Comparator* cmp,
|
|
MergeHelper* merge_helper, SequenceNumber last_sequence,
|
|
std::vector<SequenceNumber>* snapshots, Env* env,
|
|
bool expect_valid_internal_key,
|
|
Statistics* stats = nullptr,
|
|
Compaction* compaction = nullptr,
|
|
const CompactionFilter* compaction_filter = nullptr,
|
|
LogBuffer* log_buffer = nullptr);
|
|
|
|
void ResetRecordCounts();
|
|
|
|
// Seek to the beginning of the compaction iterator output.
|
|
//
|
|
// REQUIRED: Call only once.
|
|
void SeekToFirst();
|
|
|
|
// Produces the next record in the compaction.
|
|
//
|
|
// REQUIRED: SeekToFirst() has been called.
|
|
void Next();
|
|
|
|
// Getters
|
|
const Slice& key() const { return key_; }
|
|
const Slice& value() const { return value_; }
|
|
const Status& status() const { return status_; }
|
|
const ParsedInternalKey& ikey() const { return ikey_; }
|
|
bool Valid() const { return valid_; }
|
|
const Slice& user_key() const { return current_user_key_; }
|
|
const CompactionIteratorStats& iter_stats() const { return iter_stats_; }
|
|
|
|
private:
|
|
// Processes the input stream to find the next output
|
|
void NextFromInput();
|
|
|
|
// Do last preparations before presenting the output to the callee. At this
|
|
// point this only zeroes out the sequence number if possible for better
|
|
// compression.
|
|
void PrepareOutput();
|
|
|
|
// Given a sequence number, return the sequence number of the
|
|
// earliest snapshot that this sequence number is visible in.
|
|
// The snapshots themselves are arranged in ascending order of
|
|
// sequence numbers.
|
|
// Employ a sequential search because the total number of
|
|
// snapshots are typically small.
|
|
inline SequenceNumber findEarliestVisibleSnapshot(
|
|
SequenceNumber in, SequenceNumber* prev_snapshot);
|
|
|
|
Iterator* input_;
|
|
const Comparator* cmp_;
|
|
MergeHelper* merge_helper_;
|
|
const std::vector<SequenceNumber>* snapshots_;
|
|
Env* env_;
|
|
bool expect_valid_internal_key_;
|
|
Statistics* stats_;
|
|
Compaction* compaction_;
|
|
const CompactionFilter* compaction_filter_;
|
|
LogBuffer* log_buffer_;
|
|
bool bottommost_level_;
|
|
bool valid_ = false;
|
|
SequenceNumber visible_at_tip_;
|
|
SequenceNumber earliest_snapshot_;
|
|
SequenceNumber latest_snapshot_;
|
|
|
|
// State
|
|
//
|
|
// Points to a copy of the current compaction iterator output (current_key_)
|
|
// if valid_.
|
|
Slice key_;
|
|
// Points to the value in the underlying iterator that corresponds to the
|
|
// current output.
|
|
Slice value_;
|
|
// The status is OK unless compaction iterator encounters a merge operand
|
|
// while not having a merge operator defined.
|
|
Status status_;
|
|
// Stores the user key, sequence number and type of the current compaction
|
|
// iterator output (or current key in the underlying iterator during
|
|
// NextFromInput()).
|
|
ParsedInternalKey ikey_;
|
|
// Stores whether ikey_.user_key is valid. If set to false, the user key is
|
|
// not compared against the current key in the underlying iterator.
|
|
bool has_current_user_key_ = false;
|
|
bool at_next_ = false; // If false, the iterator
|
|
// Holds a copy of the current compaction iterator output (or current key in
|
|
// the underlying iterator during NextFromInput()).
|
|
IterKey current_key_;
|
|
Slice current_user_key_;
|
|
SequenceNumber current_user_key_sequence_;
|
|
SequenceNumber current_user_key_snapshot_;
|
|
MergeOutputIterator merge_out_iter_;
|
|
std::string compaction_filter_value_;
|
|
// "level_ptrs" holds indices that remember which file of an associated
|
|
// level we were last checking during the last call to compaction->
|
|
// KeyNotExistsBeyondOutputLevel(). This allows future calls to the function
|
|
// to pick off where it left off since each subcompaction's key range is
|
|
// increasing so a later call to the function must be looking for a key that
|
|
// is in or beyond the last file checked during the previous call
|
|
std::vector<size_t> level_ptrs_;
|
|
CompactionIteratorStats iter_stats_;
|
|
};
|
|
} // namespace rocksdb
|