mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-27 11:43:49 +00:00
9f2363f4c4
Summary: Add user-defined timestamp support for range deletion. The new API is `DeleteRange(opt, cf, begin_key, end_key, ts)`. Most of the change is to update the comparator to compare without timestamp. Other than that, major changes are - internal range tombstone data structures (`FragmentedRangeTombstoneList`, `RangeTombstone`, etc.) to store timestamps. - Garbage collection of range tombstones and range tombstone covered keys during compaction. - Get()/MultiGet() to return the timestamp of a range tombstone when needed. - Get/Iterator with range tombstones bounded by readoptions.timestamp. - timestamp crash test now issues DeleteRange by default. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10661 Test Plan: - Added unit test: `make check` - Stress test: `python3 tools/db_crashtest.py --enable_ts whitebox --readpercent=57 --prefixpercent=4 --writepercent=25 -delpercent=5 --iterpercent=5 --delrangepercent=4` - Ran `db_bench` to measure regression when timestamp is not enabled. The tests are for write (with some range deletion) and iterate with DB fitting in memory: `./db_bench--benchmarks=fillrandom,seekrandom --writes_per_range_tombstone=200 --max_write_buffer_number=100 --min_write_buffer_number_to_merge=100 --writes=500000 --reads=500000 --seek_nexts=10 --disable_auto_compactions -disable_wal=true --max_num_range_tombstones=1000`. Did not see consistent regression in no timestamp case. | micros/op | fillrandom | seekrandom | | --- | --- | --- | |main| 2.58 |10.96| |PR 10661| 2.68 |10.63| Reviewed By: riversand963 Differential Revision: D39441192 Pulled By: cbi42 fbshipit-source-id: f05aca3c41605caf110daf0ff405919f300ddec2
233 lines
9.2 KiB
C++
233 lines
9.2 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#pragma once
|
|
#include <string>
|
|
|
|
#include "db/read_callback.h"
|
|
#include "rocksdb/types.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
class BlobFetcher;
|
|
class Comparator;
|
|
class Logger;
|
|
class MergeContext;
|
|
class MergeOperator;
|
|
class PinnableWideColumns;
|
|
class PinnedIteratorsManager;
|
|
class Statistics;
|
|
class SystemClock;
|
|
struct ParsedInternalKey;
|
|
|
|
// Data structure for accumulating statistics during a point lookup. At the
|
|
// end of the point lookup, the corresponding ticker stats are updated. This
|
|
// avoids the overhead of frequent ticker stats updates
|
|
struct GetContextStats {
|
|
uint64_t num_cache_hit = 0;
|
|
uint64_t num_cache_index_hit = 0;
|
|
uint64_t num_cache_data_hit = 0;
|
|
uint64_t num_cache_filter_hit = 0;
|
|
uint64_t num_cache_compression_dict_hit = 0;
|
|
uint64_t num_cache_index_miss = 0;
|
|
uint64_t num_cache_filter_miss = 0;
|
|
uint64_t num_cache_data_miss = 0;
|
|
uint64_t num_cache_compression_dict_miss = 0;
|
|
uint64_t num_cache_bytes_read = 0;
|
|
uint64_t num_cache_miss = 0;
|
|
uint64_t num_cache_add = 0;
|
|
uint64_t num_cache_add_redundant = 0;
|
|
uint64_t num_cache_bytes_write = 0;
|
|
uint64_t num_cache_index_add = 0;
|
|
uint64_t num_cache_index_add_redundant = 0;
|
|
uint64_t num_cache_index_bytes_insert = 0;
|
|
uint64_t num_cache_data_add = 0;
|
|
uint64_t num_cache_data_add_redundant = 0;
|
|
uint64_t num_cache_data_bytes_insert = 0;
|
|
uint64_t num_cache_filter_add = 0;
|
|
uint64_t num_cache_filter_add_redundant = 0;
|
|
uint64_t num_cache_filter_bytes_insert = 0;
|
|
uint64_t num_cache_compression_dict_add = 0;
|
|
uint64_t num_cache_compression_dict_add_redundant = 0;
|
|
uint64_t num_cache_compression_dict_bytes_insert = 0;
|
|
// MultiGet stats.
|
|
uint64_t num_filter_read = 0;
|
|
uint64_t num_index_read = 0;
|
|
uint64_t num_sst_read = 0;
|
|
};
|
|
|
|
// A class to hold context about a point lookup, such as pointer to value
|
|
// slice, key, merge context etc, as well as the current state of the
|
|
// lookup. Any user using GetContext to track the lookup result must call
|
|
// SaveValue() whenever the internal key is found. This can happen
|
|
// repeatedly in case of merge operands. In case the key may exist with
|
|
// high probability, but IO is required to confirm and the user doesn't allow
|
|
// it, MarkKeyMayExist() must be called instead of SaveValue().
|
|
class GetContext {
|
|
public:
|
|
// Current state of the point lookup. All except kNotFound and kMerge are
|
|
// terminal states
|
|
enum GetState {
|
|
kNotFound,
|
|
kFound,
|
|
kDeleted,
|
|
kCorrupt,
|
|
kMerge, // saver contains the current merge result (the operands)
|
|
kUnexpectedBlobIndex,
|
|
// TODO: remove once wide-column entities are supported by Get/MultiGet
|
|
kUnexpectedWideColumnEntity,
|
|
};
|
|
GetContextStats get_context_stats_;
|
|
|
|
// Constructor
|
|
// @param value Holds the value corresponding to user_key. If its nullptr
|
|
// then return all merge operands corresponding to user_key
|
|
// via merge_context
|
|
// @param value_found If non-nullptr, set to false if key may be present
|
|
// but we can't be certain because we cannot do IO
|
|
// @param max_covering_tombstone_seq Pointer to highest sequence number of
|
|
// range deletion covering the key. When an internal key
|
|
// is found with smaller sequence number, the lookup
|
|
// terminates
|
|
// @param seq If non-nullptr, the sequence number of the found key will be
|
|
// saved here
|
|
// @param callback Pointer to ReadCallback to perform additional checks
|
|
// for visibility of a key
|
|
// @param is_blob_index If non-nullptr, will be used to indicate if a found
|
|
// key is of type blob index
|
|
// @param do_merge True if value associated with user_key has to be returned
|
|
// and false if all the merge operands associated with user_key has to be
|
|
// returned. Id do_merge=false then all the merge operands are stored in
|
|
// merge_context and they are never merged. The value pointer is untouched.
|
|
GetContext(const Comparator* ucmp, const MergeOperator* merge_operator,
|
|
Logger* logger, Statistics* statistics, GetState init_state,
|
|
const Slice& user_key, PinnableSlice* value,
|
|
PinnableWideColumns* columns, bool* value_found,
|
|
MergeContext* merge_context, bool do_merge,
|
|
SequenceNumber* max_covering_tombstone_seq, SystemClock* clock,
|
|
SequenceNumber* seq = nullptr,
|
|
PinnedIteratorsManager* _pinned_iters_mgr = nullptr,
|
|
ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
|
|
uint64_t tracing_get_id = 0, BlobFetcher* blob_fetcher = nullptr);
|
|
GetContext(const Comparator* ucmp, const MergeOperator* merge_operator,
|
|
Logger* logger, Statistics* statistics, GetState init_state,
|
|
const Slice& user_key, PinnableSlice* value,
|
|
PinnableWideColumns* columns, std::string* timestamp,
|
|
bool* value_found, MergeContext* merge_context, bool do_merge,
|
|
SequenceNumber* max_covering_tombstone_seq, SystemClock* clock,
|
|
SequenceNumber* seq = nullptr,
|
|
PinnedIteratorsManager* _pinned_iters_mgr = nullptr,
|
|
ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
|
|
uint64_t tracing_get_id = 0, BlobFetcher* blob_fetcher = nullptr);
|
|
|
|
GetContext() = delete;
|
|
|
|
// This can be called to indicate that a key may be present, but cannot be
|
|
// confirmed due to IO not allowed
|
|
void MarkKeyMayExist();
|
|
|
|
// Records this key, value, and any meta-data (such as sequence number and
|
|
// state) into this GetContext.
|
|
//
|
|
// If the parsed_key matches the user key that we are looking for, sets
|
|
// matched to true.
|
|
//
|
|
// Returns True if more keys need to be read (due to merges) or
|
|
// False if the complete value has been found.
|
|
bool SaveValue(const ParsedInternalKey& parsed_key, const Slice& value,
|
|
bool* matched, Cleanable* value_pinner = nullptr);
|
|
|
|
// Simplified version of the previous function. Should only be used when we
|
|
// know that the operation is a Put.
|
|
void SaveValue(const Slice& value, SequenceNumber seq);
|
|
|
|
GetState State() const { return state_; }
|
|
|
|
SequenceNumber* max_covering_tombstone_seq() {
|
|
return max_covering_tombstone_seq_;
|
|
}
|
|
|
|
bool NeedTimestamp() { return timestamp_ != nullptr; }
|
|
|
|
void SetTimestampFromRangeTombstone(const Slice& timestamp) {
|
|
assert(timestamp_);
|
|
timestamp_->assign(timestamp.data(), timestamp.size());
|
|
ts_from_rangetombstone_ = true;
|
|
}
|
|
|
|
PinnedIteratorsManager* pinned_iters_mgr() { return pinned_iters_mgr_; }
|
|
|
|
// If a non-null string is passed, all the SaveValue calls will be
|
|
// logged into the string. The operations can then be replayed on
|
|
// another GetContext with replayGetContextLog.
|
|
void SetReplayLog(std::string* replay_log) { replay_log_ = replay_log; }
|
|
|
|
// Do we need to fetch the SequenceNumber for this key?
|
|
bool NeedToReadSequence() const { return (seq_ != nullptr); }
|
|
|
|
bool sample() const { return sample_; }
|
|
|
|
bool CheckCallback(SequenceNumber seq) {
|
|
if (callback_) {
|
|
return callback_->IsVisible(seq);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
void ReportCounters();
|
|
|
|
bool has_callback() const { return callback_ != nullptr; }
|
|
|
|
uint64_t get_tracing_get_id() const { return tracing_get_id_; }
|
|
|
|
void push_operand(const Slice& value, Cleanable* value_pinner);
|
|
|
|
private:
|
|
void Merge(const Slice* value);
|
|
bool GetBlobValue(const Slice& blob_index, PinnableSlice* blob_value);
|
|
|
|
const Comparator* ucmp_;
|
|
const MergeOperator* merge_operator_;
|
|
// the merge operations encountered;
|
|
Logger* logger_;
|
|
Statistics* statistics_;
|
|
|
|
GetState state_;
|
|
Slice user_key_;
|
|
PinnableSlice* pinnable_val_;
|
|
PinnableWideColumns* columns_;
|
|
std::string* timestamp_;
|
|
bool ts_from_rangetombstone_{false};
|
|
bool* value_found_; // Is value set correctly? Used by KeyMayExist
|
|
MergeContext* merge_context_;
|
|
SequenceNumber* max_covering_tombstone_seq_;
|
|
SystemClock* clock_;
|
|
// If a key is found, seq_ will be set to the SequenceNumber of most recent
|
|
// write to the key or kMaxSequenceNumber if unknown
|
|
SequenceNumber* seq_;
|
|
std::string* replay_log_;
|
|
// Used to temporarily pin blocks when state_ == GetContext::kMerge
|
|
PinnedIteratorsManager* pinned_iters_mgr_;
|
|
ReadCallback* callback_;
|
|
bool sample_;
|
|
// Value is true if it's called as part of DB Get API and false if it's
|
|
// called as part of DB GetMergeOperands API. When it's false merge operators
|
|
// are never merged.
|
|
bool do_merge_;
|
|
bool* is_blob_index_;
|
|
// Used for block cache tracing only. A tracing get id uniquely identifies a
|
|
// Get or a MultiGet.
|
|
const uint64_t tracing_get_id_;
|
|
BlobFetcher* blob_fetcher_;
|
|
};
|
|
|
|
// Call this to replay a log and bring the get_context up to date. The replay
|
|
// log must have been created by another GetContext object, whose replay log
|
|
// must have been set by calling GetContext::SetReplayLog().
|
|
void replayGetContextLog(const Slice& replay_log, const Slice& user_key,
|
|
GetContext* get_context,
|
|
Cleanable* value_pinner = nullptr);
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|