rocksdb/table/iterator_wrapper.h
Yu Zhang f2546b6623 Support returning write unix time in iterator property (#12428)
Summary:
This PR adds support to return data's approximate unix write time in the iterator property API. The general implementation is:
1) If the entry comes from a SST file, the sequence number to time mapping recorded in that file's table properties will be used to deduce the entry's write time from its sequence number. If no such recording is available, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown except if the entry's sequence number is zero, in which case, 0 is returned. This also means that even if `preclude_last_level_data_seconds` and `preserve_internal_time_seconds` can be toggled off between DB reopens, as long as the SST file's table property has the mapping available, the entry's write time can be deduced and returned.

2) If the entry comes from memtable, we will use the DB's sequence number to write time mapping to do similar things. A copy of the DB's seqno to write time mapping is kept in SuperVersion to allow iterators to have lock free access. This also means a new `SuperVersion` is installed each time DB's seqno to time mapping updates, which is originally proposed by Peter in  https://github.com/facebook/rocksdb/issues/11928 . Similarly, if the feature is not enabled, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown.

Needed follow up:
1) The write time for `kTypeValuePreferredSeqno` should be special cased, where it's already specified by the user, so we can directly return it.

2) Flush job can be updated to use DB's seqno to time mapping copy in the SuperVersion.

3) Handle the case when `TimedPut` is called with a write time that is `std::numeric_limits<uint64_t>::max()`. We can make it a regular `Put`.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12428

Test Plan: Added unit test

Reviewed By: pdillinger

Differential Revision: D54967067

Pulled By: jowlyzhang

fbshipit-source-id: c795b1b7ec142e09e53f2ed3461cf719833cb37a
2024-03-15 15:37:37 -07:00

226 lines
5.6 KiB
C++

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <set>
#include "table/internal_iterator.h"
#include "test_util/sync_point.h"
namespace ROCKSDB_NAMESPACE {
// A internal wrapper class with an interface similar to Iterator that caches
// the valid() and key() results for an underlying iterator.
// This can help avoid virtual function calls and also gives better
// cache locality.
template <class TValue = Slice>
class IteratorWrapperBase {
public:
IteratorWrapperBase() : iter_(nullptr), valid_(false) {}
explicit IteratorWrapperBase(InternalIteratorBase<TValue>* _iter)
: iter_(nullptr) {
Set(_iter);
}
~IteratorWrapperBase() {}
InternalIteratorBase<TValue>* iter() const { return iter_; }
void SetRangeDelReadSeqno(SequenceNumber read_seqno) {
if (iter_) {
iter_->SetRangeDelReadSeqno(read_seqno);
}
}
// Set the underlying Iterator to _iter and return
// previous underlying Iterator.
InternalIteratorBase<TValue>* Set(InternalIteratorBase<TValue>* _iter) {
InternalIteratorBase<TValue>* old_iter = iter_;
iter_ = _iter;
if (iter_ == nullptr) {
valid_ = false;
} else {
Update();
}
return old_iter;
}
void DeleteIter(bool is_arena_mode) {
if (iter_) {
#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
if (!status_checked_after_invalid_) {
// If this assertion fails, it is likely that you did not check
// iterator status after Valid() returns false.
fprintf(stderr,
"Failed to check status after Valid() returned false from this "
"iterator.\n");
port::PrintStack();
std::abort();
}
#endif
if (!is_arena_mode) {
delete iter_;
} else {
iter_->~InternalIteratorBase<TValue>();
}
}
}
// Iterator interface methods
bool Valid() const {
#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
status_checked_after_invalid_ = valid_;
#endif
return valid_;
}
Slice key() const {
assert(Valid());
return result_.key;
}
uint64_t write_unix_time() const {
assert(Valid());
return iter_->write_unix_time();
}
TValue value() const {
assert(Valid());
return iter_->value();
}
// Methods below require iter() != nullptr
Status status() const {
#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
status_checked_after_invalid_ = true;
#endif
assert(iter_);
return iter_->status();
}
bool PrepareValue() {
assert(Valid());
if (result_.value_prepared) {
return true;
}
if (iter_->PrepareValue()) {
result_.value_prepared = true;
result_.key = iter_->key();
return true;
}
assert(!iter_->Valid());
valid_ = false;
return false;
}
void Next() {
assert(iter_);
valid_ = iter_->NextAndGetResult(&result_);
assert(!valid_ || iter_->status().ok());
}
bool NextAndGetResult(IterateResult* result) {
assert(iter_);
valid_ = iter_->NextAndGetResult(&result_);
*result = result_;
assert(!valid_ || iter_->status().ok());
return valid_;
}
void Prev() {
assert(iter_);
iter_->Prev();
Update();
}
void Seek(const Slice& k) {
assert(iter_);
iter_->Seek(k);
Update();
}
void SeekForPrev(const Slice& k) {
assert(iter_);
iter_->SeekForPrev(k);
Update();
}
void SeekToFirst() {
assert(iter_);
iter_->SeekToFirst();
Update();
}
void SeekToLast() {
assert(iter_);
iter_->SeekToLast();
Update();
}
bool MayBeOutOfLowerBound() {
assert(Valid());
return iter_->MayBeOutOfLowerBound();
}
IterBoundCheck UpperBoundCheckResult() {
assert(Valid());
return result_.bound_check_result;
}
void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) {
assert(iter_);
iter_->SetPinnedItersMgr(pinned_iters_mgr);
}
bool IsKeyPinned() const {
assert(Valid());
return iter_->IsKeyPinned();
}
bool IsValuePinned() const {
assert(Valid());
return iter_->IsValuePinned();
}
bool IsValuePrepared() const { return result_.value_prepared; }
Slice user_key() const {
assert(Valid());
return iter_->user_key();
}
void UpdateReadaheadState(InternalIteratorBase<TValue>* old_iter) {
if (old_iter && iter_) {
ReadaheadFileInfo readahead_file_info;
old_iter->GetReadaheadState(&readahead_file_info);
iter_->SetReadaheadState(&readahead_file_info);
}
}
bool IsDeleteRangeSentinelKey() const {
return iter_->IsDeleteRangeSentinelKey();
}
private:
void Update() {
valid_ = iter_->Valid();
if (valid_) {
assert(iter_->status().ok());
result_.key = iter_->key();
result_.bound_check_result = IterBoundCheck::kUnknown;
result_.value_prepared = false;
}
}
InternalIteratorBase<TValue>* iter_;
IterateResult result_;
bool valid_;
#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
mutable bool status_checked_after_invalid_ = true;
#endif
};
using IteratorWrapper = IteratorWrapperBase<Slice>;
class Arena;
// Return an empty iterator (yields nothing) allocated from arena.
template <class TValue = Slice>
InternalIteratorBase<TValue>* NewEmptyInternalIterator(Arena* arena);
} // namespace ROCKSDB_NAMESPACE