mirror of
https://github.com/facebook/rocksdb.git
synced 2024-12-04 20:02:50 +00:00
f2546b6623
Summary: This PR adds support to return data's approximate unix write time in the iterator property API. The general implementation is: 1) If the entry comes from a SST file, the sequence number to time mapping recorded in that file's table properties will be used to deduce the entry's write time from its sequence number. If no such recording is available, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown except if the entry's sequence number is zero, in which case, 0 is returned. This also means that even if `preclude_last_level_data_seconds` and `preserve_internal_time_seconds` can be toggled off between DB reopens, as long as the SST file's table property has the mapping available, the entry's write time can be deduced and returned. 2) If the entry comes from memtable, we will use the DB's sequence number to write time mapping to do similar things. A copy of the DB's seqno to write time mapping is kept in SuperVersion to allow iterators to have lock free access. This also means a new `SuperVersion` is installed each time DB's seqno to time mapping updates, which is originally proposed by Peter in https://github.com/facebook/rocksdb/issues/11928 . Similarly, if the feature is not enabled, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown. Needed follow up: 1) The write time for `kTypeValuePreferredSeqno` should be special cased, where it's already specified by the user, so we can directly return it. 2) Flush job can be updated to use DB's seqno to time mapping copy in the SuperVersion. 3) Handle the case when `TimedPut` is called with a write time that is `std::numeric_limits<uint64_t>::max()`. We can make it a regular `Put`. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12428 Test Plan: Added unit test Reviewed By: pdillinger Differential Revision: D54967067 Pulled By: jowlyzhang fbshipit-source-id: c795b1b7ec142e09e53f2ed3461cf719833cb37a
243 lines
9.7 KiB
C++
243 lines
9.7 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
//
|
|
|
|
#pragma once
|
|
|
|
#include <string>
|
|
|
|
#include "db/dbformat.h"
|
|
#include "file/readahead_file_info.h"
|
|
#include "rocksdb/comparator.h"
|
|
#include "rocksdb/iterator.h"
|
|
#include "rocksdb/status.h"
|
|
#include "table/format.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
class PinnedIteratorsManager;
|
|
|
|
enum class IterBoundCheck : char {
|
|
kUnknown = 0,
|
|
kOutOfBound,
|
|
kInbound,
|
|
};
|
|
|
|
struct IterateResult {
|
|
Slice key;
|
|
IterBoundCheck bound_check_result = IterBoundCheck::kUnknown;
|
|
// If false, PrepareValue() needs to be called before value().
|
|
bool value_prepared = true;
|
|
};
|
|
|
|
template <class TValue>
|
|
class InternalIteratorBase : public Cleanable {
|
|
public:
|
|
InternalIteratorBase() {}
|
|
|
|
// No copying allowed
|
|
InternalIteratorBase(const InternalIteratorBase&) = delete;
|
|
InternalIteratorBase& operator=(const InternalIteratorBase&) = delete;
|
|
|
|
virtual ~InternalIteratorBase() {}
|
|
|
|
// This iterator will only process range tombstones with sequence
|
|
// number <= `read_seqno`.
|
|
// Noop for most child classes.
|
|
// For range tombstone iterators (TruncatedRangeDelIterator,
|
|
// FragmentedRangeTombstoneIterator), will only return range tombstones with
|
|
// sequence number <= `read_seqno`. For LevelIterator, it may open new table
|
|
// files and create new range tombstone iterators during scanning. It will use
|
|
// `read_seqno` as the sequence number for creating new range tombstone
|
|
// iterators.
|
|
virtual void SetRangeDelReadSeqno(SequenceNumber /* read_seqno */) {}
|
|
|
|
// An iterator is either positioned at a key/value pair, or
|
|
// not valid. This method returns true iff the iterator is valid.
|
|
// Always returns false if !status().ok().
|
|
virtual bool Valid() const = 0;
|
|
|
|
// Position at the first key in the source. The iterator is Valid()
|
|
// after this call iff the source is not empty.
|
|
virtual void SeekToFirst() = 0;
|
|
|
|
// Position at the last key in the source. The iterator is
|
|
// Valid() after this call iff the source is not empty.
|
|
virtual void SeekToLast() = 0;
|
|
|
|
// Position at the first key in the source that at or past target
|
|
// The iterator is Valid() after this call iff the source contains
|
|
// an entry that comes at or past target.
|
|
// All Seek*() methods clear any error status() that the iterator had prior to
|
|
// the call; after the seek, status() indicates only the error (if any) that
|
|
// happened during the seek, not any past errors.
|
|
// 'target' contains user timestamp if timestamp is enabled.
|
|
virtual void Seek(const Slice& target) = 0;
|
|
|
|
// Position at the first key in the source that at or before target
|
|
// The iterator is Valid() after this call iff the source contains
|
|
// an entry that comes at or before target.
|
|
virtual void SeekForPrev(const Slice& target) = 0;
|
|
|
|
// Moves to the next entry in the source. After this call, Valid() is
|
|
// true iff the iterator was not positioned at the last entry in the source.
|
|
// REQUIRES: Valid()
|
|
virtual void Next() = 0;
|
|
|
|
// Moves to the next entry in the source, and return result. Iterator
|
|
// implementation should override this method to help methods inline better,
|
|
// or when UpperBoundCheckResult() is non-trivial.
|
|
// REQUIRES: Valid()
|
|
virtual bool NextAndGetResult(IterateResult* result) {
|
|
Next();
|
|
bool is_valid = Valid();
|
|
if (is_valid) {
|
|
result->key = key();
|
|
// Default may_be_out_of_upper_bound to true to avoid unnecessary virtual
|
|
// call. If an implementation has non-trivial UpperBoundCheckResult(),
|
|
// it should also override NextAndGetResult().
|
|
result->bound_check_result = IterBoundCheck::kUnknown;
|
|
result->value_prepared = false;
|
|
assert(UpperBoundCheckResult() != IterBoundCheck::kOutOfBound);
|
|
}
|
|
return is_valid;
|
|
}
|
|
|
|
// Moves to the previous entry in the source. After this call, Valid() is
|
|
// true iff the iterator was not positioned at the first entry in source.
|
|
// REQUIRES: Valid()
|
|
virtual void Prev() = 0;
|
|
|
|
// Return the key for the current entry. The underlying storage for
|
|
// the returned slice is valid only until the next modification of
|
|
// the iterator.
|
|
// REQUIRES: Valid()
|
|
virtual Slice key() const = 0;
|
|
|
|
// Returns the approximate write time of this entry, which is deduced from
|
|
// sequence number if sequence number to time mapping is available.
|
|
// The default implementation returns maximum uint64_t and that indicates the
|
|
// write time is unknown.
|
|
virtual uint64_t write_unix_time() const {
|
|
return std::numeric_limits<uint64_t>::max();
|
|
}
|
|
|
|
// Return user key for the current entry.
|
|
// REQUIRES: Valid()
|
|
virtual Slice user_key() const { return ExtractUserKey(key()); }
|
|
|
|
// Return the value for the current entry. The underlying storage for
|
|
// the returned slice is valid only until the next modification of
|
|
// the iterator.
|
|
// REQUIRES: Valid()
|
|
// REQUIRES: PrepareValue() has been called if needed (see PrepareValue()).
|
|
virtual TValue value() const = 0;
|
|
|
|
// If an error has occurred, return it. Else return an ok status.
|
|
// If non-blocking IO is requested and this operation cannot be
|
|
// satisfied without doing some IO, then this returns Status::Incomplete().
|
|
virtual Status status() const = 0;
|
|
|
|
// For some types of iterators, sometimes Seek()/Next()/SeekForPrev()/etc may
|
|
// load key but not value (to avoid the IO cost of reading the value from disk
|
|
// if it won't be not needed). This method loads the value in such situation.
|
|
//
|
|
// Needs to be called before value() at least once after each iterator
|
|
// movement (except if IterateResult::value_prepared = true), for iterators
|
|
// created with allow_unprepared_value = true.
|
|
//
|
|
// Returns false if an error occurred; in this case Valid() is also changed
|
|
// to false, and status() is changed to non-ok.
|
|
// REQUIRES: Valid()
|
|
virtual bool PrepareValue() { return true; }
|
|
|
|
// Keys return from this iterator can be smaller than iterate_lower_bound.
|
|
virtual bool MayBeOutOfLowerBound() { return true; }
|
|
|
|
// If the iterator has checked the key against iterate_upper_bound, returns
|
|
// the result here. The function can be used by user of the iterator to skip
|
|
// their own checks. If Valid() = true, IterBoundCheck::kUnknown is always
|
|
// a valid value. If Valid() = false, IterBoundCheck::kOutOfBound indicates
|
|
// that the iterator is filtered out by upper bound checks.
|
|
virtual IterBoundCheck UpperBoundCheckResult() {
|
|
return IterBoundCheck::kUnknown;
|
|
}
|
|
|
|
// Pass the PinnedIteratorsManager to the Iterator, most Iterators don't
|
|
// communicate with PinnedIteratorsManager so default implementation is no-op
|
|
// but for Iterators that need to communicate with PinnedIteratorsManager
|
|
// they will implement this function and use the passed pointer to communicate
|
|
// with PinnedIteratorsManager.
|
|
virtual void SetPinnedItersMgr(PinnedIteratorsManager* /*pinned_iters_mgr*/) {
|
|
}
|
|
|
|
// If true, this means that the Slice returned by key() is valid as long as
|
|
// PinnedIteratorsManager::ReleasePinnedData is not called and the
|
|
// Iterator is not deleted.
|
|
//
|
|
// IsKeyPinned() is guaranteed to always return true if
|
|
// - Iterator is created with ReadOptions::pin_data = true
|
|
// - DB tables were created with BlockBasedTableOptions::use_delta_encoding
|
|
// set to false.
|
|
virtual bool IsKeyPinned() const { return false; }
|
|
|
|
// If true, this means that the Slice returned by value() is valid as long as
|
|
// PinnedIteratorsManager::ReleasePinnedData is not called and the
|
|
// Iterator is not deleted.
|
|
// REQUIRES: Same as for value().
|
|
virtual bool IsValuePinned() const { return false; }
|
|
|
|
virtual Status GetProperty(std::string /*prop_name*/, std::string* /*prop*/) {
|
|
return Status::NotSupported("");
|
|
}
|
|
|
|
// When iterator moves from one file to another file at same level, new file's
|
|
// readahead state (details of last block read) is updated with previous
|
|
// file's readahead state. This way internal readahead_size of Prefetch Buffer
|
|
// doesn't start from scratch and can fall back to 8KB with no prefetch if
|
|
// reads are not sequential.
|
|
//
|
|
// Default implementation is no-op and its implemented by iterators.
|
|
virtual void GetReadaheadState(ReadaheadFileInfo* /*readahead_file_info*/) {}
|
|
|
|
// Default implementation is no-op and its implemented by iterators.
|
|
virtual void SetReadaheadState(ReadaheadFileInfo* /*readahead_file_info*/) {}
|
|
|
|
// When used under merging iterator, LevelIterator treats file boundaries
|
|
// as sentinel keys to prevent it from moving to next SST file before range
|
|
// tombstones in the current SST file are no longer needed. This method makes
|
|
// it cheap to check if the current key is a sentinel key. This should only be
|
|
// used by MergingIterator and LevelIterator for now.
|
|
virtual bool IsDeleteRangeSentinelKey() const { return false; }
|
|
|
|
protected:
|
|
void SeekForPrevImpl(const Slice& target, const CompareInterface* cmp) {
|
|
Seek(target);
|
|
if (!Valid()) {
|
|
SeekToLast();
|
|
}
|
|
while (Valid() && cmp->Compare(target, key()) < 0) {
|
|
Prev();
|
|
}
|
|
}
|
|
};
|
|
|
|
using InternalIterator = InternalIteratorBase<Slice>;
|
|
|
|
// Return an empty iterator (yields nothing).
|
|
template <class TValue = Slice>
|
|
InternalIteratorBase<TValue>* NewEmptyInternalIterator();
|
|
|
|
// Return an empty iterator with the specified status.
|
|
template <class TValue = Slice>
|
|
InternalIteratorBase<TValue>* NewErrorInternalIterator(const Status& status);
|
|
|
|
// Return an empty iterator with the specified status, allocated arena.
|
|
template <class TValue = Slice>
|
|
InternalIteratorBase<TValue>* NewErrorInternalIterator(const Status& status,
|
|
Arena* arena);
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|