mirror of https://github.com/facebook/rocksdb.git
Support returning write unix time in iterator property (#12428)
Summary:
This PR adds support for returning a data entry's approximate unix write time through the iterator property API. The general implementation is:

1) If the entry comes from an SST file, the sequence number to time mapping recorded in that file's table properties is used to deduce the entry's write time from its sequence number. If no such recording is available, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown, except when the entry's sequence number is zero, in which case 0 is returned. This also means that even if `preclude_last_level_data_seconds` and `preserve_internal_time_seconds` are toggled off between DB reopens, as long as the SST file's table property has the mapping available, the entry's write time can still be deduced and returned.

2) If the entry comes from a memtable, the DB's sequence number to write time mapping is used in the same way. A copy of the DB's seqno to write time mapping is kept in SuperVersion to give iterators lock-free access. This also means a new `SuperVersion` is installed each time the DB's seqno to time mapping updates, as originally proposed by Peter in https://github.com/facebook/rocksdb/issues/11928 . Similarly, if the feature is not enabled, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown.

Needed follow up:
1) The write time for `kTypeValuePreferredSeqno` should be special cased: it is already specified by the user, so we can return it directly.
2) The flush job can be updated to use the DB's seqno to time mapping copy in the SuperVersion.
3) Handle the case when `TimedPut` is called with a write time that is `std::numeric_limits<uint64_t>::max()`. We can make it a regular `Put`.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12428

Test Plan: Added unit test

Reviewed By: pdillinger

Differential Revision: D54967067

Pulled By: jowlyzhang

fbshipit-source-id: c795b1b7ec142e09e53f2ed3461cf719833cb37a
This commit is contained in:
Parent: 4d5ebad971
Commit: f2546b6623
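Before the diff, here is a minimal, hypothetical sketch of how an application could read the new property described in the summary. The property name "rocksdb.iterator.write-time", its fixed64 encoding written via `PutFixed64`, and the `std::numeric_limits<uint64_t>::max()` "unknown" sentinel come from this PR; the DB setup, path, and helper function are illustrative assumptions, and `GetFixed64` is RocksDB's internal decoding helper that the PR's unit tests use.

```cpp
#include <cstdint>
#include <limits>
#include <memory>
#include <string>

#include "rocksdb/db.h"
#include "util/coding.h"  // GetFixed64, the internal helper the PR's tests use

// Hypothetical helper: scan a DB and decode the approximate write time of
// each entry through the new iterator property.
void ScanWithWriteTimes(const std::string& db_path) {
  ROCKSDB_NAMESPACE::Options options;
  options.create_if_missing = true;
  // Recording must be enabled for the mapping to exist; otherwise the
  // property decodes to std::numeric_limits<uint64_t>::max() ("unknown").
  options.preserve_internal_time_seconds = 10000;

  ROCKSDB_NAMESPACE::DB* db = nullptr;
  if (!ROCKSDB_NAMESPACE::DB::Open(options, db_path, &db).ok()) {
    return;
  }

  std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> iter(
      db->NewIterator(ROCKSDB_NAMESPACE::ReadOptions()));
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    std::string prop;
    if (iter->GetProperty("rocksdb.iterator.write-time", &prop).ok()) {
      ROCKSDB_NAMESPACE::Slice prop_slice = prop;
      uint64_t write_time = 0;
      if (ROCKSDB_NAMESPACE::GetFixed64(&prop_slice, &write_time) &&
          write_time != std::numeric_limits<uint64_t>::max()) {
        // write_time is the oldest possible unix write time of this entry.
      }
    }
  }
  delete db;
}
```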
@@ -477,13 +477,16 @@ void SuperVersion::Cleanup() {
   cfd->UnrefAndTryDelete();
 }
 
-void SuperVersion::Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
-                        MemTableListVersion* new_imm, Version* new_current) {
+void SuperVersion::Init(
+    ColumnFamilyData* new_cfd, MemTable* new_mem, MemTableListVersion* new_imm,
+    Version* new_current,
+    std::shared_ptr<const SeqnoToTimeMapping> new_seqno_to_time_mapping) {
   cfd = new_cfd;
   mem = new_mem;
   imm = new_imm;
   current = new_current;
   full_history_ts_low = cfd->GetFullHistoryTsLow();
+  seqno_to_time_mapping = std::move(new_seqno_to_time_mapping);
   cfd->Ref();
   mem->Ref();
   imm->Ref();

@@ -1196,9 +1199,10 @@ Status ColumnFamilyData::RangesOverlapWithMemtables(
   ReadOptions read_opts;
   read_opts.total_order_seek = true;
   MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena);
-  merge_iter_builder.AddIterator(
-      super_version->mem->NewIterator(read_opts, &arena));
-  super_version->imm->AddIterators(read_opts, &merge_iter_builder,
+  merge_iter_builder.AddIterator(super_version->mem->NewIterator(
+      read_opts, /*seqno_to_time_mapping=*/nullptr, &arena));
+  super_version->imm->AddIterators(read_opts, /*seqno_to_time_mapping=*/nullptr,
+                                   &merge_iter_builder,
                                    false /* add_range_tombstone_iter */);
   ScopedArenaIterator memtable_iter(merge_iter_builder.Finish());

@@ -1336,7 +1340,12 @@ void ColumnFamilyData::InstallSuperVersion(
     const MutableCFOptions& mutable_cf_options) {
   SuperVersion* new_superversion = sv_context->new_superversion.release();
   new_superversion->mutable_cf_options = mutable_cf_options;
-  new_superversion->Init(this, mem_, imm_.current(), current_);
+  new_superversion->Init(this, mem_, imm_.current(), current_,
+                         sv_context->new_seqno_to_time_mapping
+                             ? std::move(sv_context->new_seqno_to_time_mapping)
+                         : super_version_
+                             ? super_version_->ShareSeqnoToTimeMapping()
+                             : nullptr);
   SuperVersion* old_superversion = super_version_;
   super_version_ = new_superversion;
   if (old_superversion == nullptr || old_superversion->current != current() ||
@@ -26,6 +26,7 @@
 #include "rocksdb/env.h"
 #include "rocksdb/options.h"
 #include "trace_replay/block_cache_tracer.h"
+#include "util/cast_util.h"
 #include "util/hash_containers.h"
 #include "util/thread_local.h"
 
@@ -219,6 +220,9 @@ struct SuperVersion {
   // enable UDT feature, this is an empty string.
   std::string full_history_ts_low;
 
+  // A shared copy of the DB's seqno to time mapping.
+  std::shared_ptr<const SeqnoToTimeMapping> seqno_to_time_mapping{nullptr};
+
   // should be called outside the mutex
   SuperVersion() = default;
   ~SuperVersion();

@@ -232,8 +236,23 @@
   // that needs to be deleted in to_delete vector. Unrefing those
   // objects needs to be done in the mutex
   void Cleanup();
-  void Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
-            MemTableListVersion* new_imm, Version* new_current);
+  void Init(
+      ColumnFamilyData* new_cfd, MemTable* new_mem,
+      MemTableListVersion* new_imm, Version* new_current,
+      std::shared_ptr<const SeqnoToTimeMapping> new_seqno_to_time_mapping);
+
+  // Share the ownership of the seqno to time mapping object referred to in this
+  // SuperVersion. To be used by the new SuperVersion to be installed after this
+  // one if seqno to time mapping does not change in between these two
+  // SuperVersions.
+  std::shared_ptr<const SeqnoToTimeMapping> ShareSeqnoToTimeMapping() {
+    return seqno_to_time_mapping;
+  }
+
+  // Access the seqno to time mapping object in this SuperVersion.
+  UnownedPtr<const SeqnoToTimeMapping> GetSeqnoToTimeMapping() const {
+    return seqno_to_time_mapping.get();
+  }
 
   // The value of dummy is not actually used. kSVInUse takes its address as a
   // mark in the thread local storage to indicate the SuperVersion is in use
@@ -14,6 +14,7 @@
 #include "rocksdb/listener.h"
 #include "rocksdb/utilities/debug.h"
 #include "test_util/mock_time_env.h"
+#include "utilities/merge_operators.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -1307,8 +1308,8 @@ TEST_F(TieredCompactionTest, CheckInternalKeyRange) {
 
 class PrecludeLastLevelTest : public DBTestBase {
  public:
-  PrecludeLastLevelTest()
-      : DBTestBase("preclude_last_level_test", /*env_do_fsync=*/false) {
+  PrecludeLastLevelTest(std::string test_name = "preclude_last_level_test")
+      : DBTestBase(test_name, /*env_do_fsync=*/false) {
    mock_clock_ = std::make_shared<MockSystemClock>(env_->GetSystemClock());
    mock_clock_->SetCurrentTime(kMockStartTime);
    mock_env_ = std::make_unique<CompositeEnvWrapper>(env_, mock_clock_);

@@ -2256,6 +2257,253 @@ TEST_F(PrecludeLastLevelTest, RangeDelsCauseFileEndpointsToOverlap) {
   Close();
 }
 
+// Tests DBIter::GetProperty("rocksdb.iterator.write-time") return a data's
+// approximate write unix time.
+// Test Param:
+// 1) use tailing iterator or regular iterator (when it applies)
+class IteratorWriteTimeTest : public PrecludeLastLevelTest,
+                              public testing::WithParamInterface<bool> {
+ public:
+  IteratorWriteTimeTest() : PrecludeLastLevelTest("iterator_write_time_test") {}
+
+  uint64_t VerifyKeyAndGetWriteTime(Iterator* iter,
+                                    const std::string& expected_key) {
+    std::string prop;
+    uint64_t write_time = 0;
+    EXPECT_TRUE(iter->Valid());
+    EXPECT_EQ(expected_key, iter->key());
+    EXPECT_OK(iter->GetProperty("rocksdb.iterator.write-time", &prop));
+    Slice prop_slice = prop;
+    EXPECT_TRUE(GetFixed64(&prop_slice, &write_time));
+    return write_time;
+  }
+
+  void VerifyKeyAndWriteTime(Iterator* iter, const std::string& expected_key,
+                             uint64_t expected_write_time) {
+    std::string prop;
+    uint64_t write_time = 0;
+    EXPECT_TRUE(iter->Valid());
+    EXPECT_EQ(expected_key, iter->key());
+    EXPECT_OK(iter->GetProperty("rocksdb.iterator.write-time", &prop));
+    Slice prop_slice = prop;
+    EXPECT_TRUE(GetFixed64(&prop_slice, &write_time));
+    EXPECT_EQ(expected_write_time, write_time);
+  }
+};
+
+TEST_P(IteratorWriteTimeTest, ReadFromMemtables) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kNumKeys = 100;
+  const int kSecondsPerRecording = 101;
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.env = mock_env_.get();
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.preserve_internal_time_seconds = 10000;
+  options.num_levels = kNumLevels;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < kNumKeys; i++) {
+    dbfull()->TEST_WaitForPeriodicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(kSecondsPerRecording); });
+    ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+  }
+
+  ReadOptions ropts;
+  ropts.tailing = GetParam();
+  int i;
+
+  // Forward iteration
+  uint64_t start_time = 0;
+  {
+    std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ropts));
+    for (iter->SeekToFirst(), i = 0; iter->Valid(); iter->Next(), i++) {
+      if (start_time == 0) {
+        start_time = VerifyKeyAndGetWriteTime(iter.get(), Key(i));
+      } else {
+        VerifyKeyAndWriteTime(iter.get(), Key(i),
+                              start_time + kSecondsPerRecording * (i + 1));
+      }
+    }
+    ASSERT_OK(iter->status());
+  }
+
+  // Backward iteration
+  {
+    ropts.tailing = false;
+    std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ropts));
+    for (iter->SeekToLast(), i = kNumKeys - 1; iter->Valid();
+         iter->Prev(), i--) {
+      if (i == 0) {
+        VerifyKeyAndWriteTime(iter.get(), Key(i), start_time);
+      } else {
+        VerifyKeyAndWriteTime(iter.get(), Key(i),
+                              start_time + kSecondsPerRecording * (i + 1));
+      }
+    }
+    ASSERT_OK(iter->status());
+  }
+  Close();
+}
+
+TEST_P(IteratorWriteTimeTest, ReadFromSstFile) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kNumKeys = 100;
+  const int kSecondsPerRecording = 101;
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.env = mock_env_.get();
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.preserve_internal_time_seconds = 10000;
+  options.num_levels = kNumLevels;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < kNumKeys; i++) {
+    dbfull()->TEST_WaitForPeriodicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(kSecondsPerRecording); });
+    ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+  }
+
+  ASSERT_OK(Flush());
+  ReadOptions ropts;
+  ropts.tailing = GetParam();
+  std::string prop;
+  int i;
+
+  // Forward iteration
+  uint64_t start_time = 0;
+  {
+    std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ropts));
+    for (iter->SeekToFirst(), i = 0; iter->Valid(); iter->Next(), i++) {
+      if (start_time == 0) {
+        start_time = VerifyKeyAndGetWriteTime(iter.get(), Key(i));
+      } else {
+        VerifyKeyAndWriteTime(iter.get(), Key(i),
+                              start_time + kSecondsPerRecording * (i + 1));
+      }
+    }
+    ASSERT_OK(iter->status());
+  }
+
+  // Backward iteration
+  {
+    ropts.tailing = false;
+    std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ropts));
+    for (iter->SeekToLast(), i = kNumKeys - 1; iter->Valid();
+         iter->Prev(), i--) {
+      if (i == 0) {
+        VerifyKeyAndWriteTime(iter.get(), Key(i), start_time);
+      } else {
+        VerifyKeyAndWriteTime(iter.get(), Key(i),
+                              start_time + kSecondsPerRecording * (i + 1));
+      }
+    }
+    ASSERT_OK(iter->status());
+  }
+
+  // Reopen the DB and disable the seqno to time recording. Data retrieved from
+  // SST files still have write time available.
+  options.preserve_internal_time_seconds = 0;
+  DestroyAndReopen(options);
+
+  dbfull()->TEST_WaitForPeriodicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kSecondsPerRecording); });
+  ASSERT_OK(Put("a", "val"));
+  ASSERT_TRUE(dbfull()->TEST_GetSeqnoToTimeMapping().Empty());
+
+  {
+    std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ropts));
+    iter->SeekToFirst();
+    ASSERT_TRUE(iter->Valid());
+    // "a" is retrieved from memtable, its write time is unknown because the
+    // seqno to time mapping recording is not available.
+    VerifyKeyAndWriteTime(iter.get(), "a",
+                          std::numeric_limits<uint64_t>::max());
+    for (iter->Next(), i = 0; iter->Valid(); iter->Next(), i++) {
+      if (i == 0) {
+        VerifyKeyAndWriteTime(iter.get(), Key(i), start_time);
+      } else {
+        VerifyKeyAndWriteTime(iter.get(), Key(i),
+                              start_time + kSecondsPerRecording * (i + 1));
+      }
+    }
+    ASSERT_OK(iter->status());
+  }
+
+  // There is no write time info for "a" after it's flushed to SST file either.
+  ASSERT_OK(Flush());
+  {
+    std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ropts));
+    iter->SeekToFirst();
+    ASSERT_TRUE(iter->Valid());
+    VerifyKeyAndWriteTime(iter.get(), "a",
+                          std::numeric_limits<uint64_t>::max());
+  }
+
+  // Sequence number zeroed out after compacted to the last level, write time
+  // all becomes zero.
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  {
+    std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ropts));
+    iter->SeekToFirst();
+    for (iter->Next(), i = 0; iter->Valid(); iter->Next(), i++) {
+      VerifyKeyAndWriteTime(iter.get(), Key(i), 0);
+    }
+    ASSERT_OK(iter->status());
+  }
+  Close();
+}
+
+TEST_P(IteratorWriteTimeTest, MergeReturnsBaseValueWriteTime) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kSecondsPerRecording = 101;
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.env = mock_env_.get();
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.preserve_internal_time_seconds = 10000;
+  options.num_levels = kNumLevels;
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  DestroyAndReopen(options);
+
+  dbfull()->TEST_WaitForPeriodicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kSecondsPerRecording); });
+  ASSERT_OK(Put("foo", "fv1"));
+
+  dbfull()->TEST_WaitForPeriodicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kSecondsPerRecording); });
+  ASSERT_OK(Put("bar", "bv1"));
+  ASSERT_OK(Merge("foo", "bv1"));
+
+  ReadOptions ropts;
+  ropts.tailing = GetParam();
+  {
+    std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ropts));
+    iter->SeekToFirst();
+    uint64_t bar_time = VerifyKeyAndGetWriteTime(iter.get(), "bar");
+    iter->Next();
+    uint64_t foo_time = VerifyKeyAndGetWriteTime(iter.get(), "foo");
+    // "foo" has an older write time because its base value's write time is used
+    ASSERT_GT(bar_time, foo_time);
+    iter->Next();
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_OK(iter->status());
+  }
+
+  Close();
+}
+
+INSTANTIATE_TEST_CASE_P(IteratorWriteTimeTest, IteratorWriteTimeTest,
+                        testing::Bool());
+
 } // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
@@ -16,6 +16,7 @@
 #include <cinttypes>
 #include <cstdio>
 #include <map>
+#include <memory>
 #include <set>
 #include <sstream>
 #include <stdexcept>

@@ -1979,7 +1980,8 @@ InternalIterator* DBImpl::NewInternalIterator(
       super_version->mutable_cf_options.prefix_extractor != nullptr,
       read_options.iterate_upper_bound);
   // Collect iterator for mutable memtable
-  auto mem_iter = super_version->mem->NewIterator(read_options, arena);
+  auto mem_iter = super_version->mem->NewIterator(
+      read_options, super_version->GetSeqnoToTimeMapping(), arena);
   Status s;
   if (!read_options.ignore_range_deletions) {
     TruncatedRangeDelIterator* mem_tombstone_iter = nullptr;

@@ -2001,8 +2003,9 @@ InternalIterator* DBImpl::NewInternalIterator(
 
   // Collect all needed child iterators for immutable memtables
   if (s.ok()) {
-    super_version->imm->AddIterators(read_options, &merge_iter_builder,
-                                     !read_options.ignore_range_deletions);
+    super_version->imm->AddIterators(
+        read_options, super_version->GetSeqnoToTimeMapping(),
+        &merge_iter_builder, !read_options.ignore_range_deletions);
   }
   TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s);
   if (s.ok()) {

@@ -6466,6 +6469,8 @@ void DBImpl::RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds) {
   immutable_db_options_.clock->GetCurrentTime(&unix_time_signed)
       .PermitUncheckedError();  // Ignore error
   uint64_t unix_time = static_cast<uint64_t>(unix_time_signed);
+
+  std::vector<SuperVersionContext> sv_contexts;
   if (populate_historical_seconds > 0) {
     bool success = true;
     {

@@ -6476,6 +6481,7 @@ void DBImpl::RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds) {
         success = seqno_to_time_mapping_.PrePopulate(
             from_seqno, seqno, unix_time - populate_historical_seconds,
             unix_time);
+        InstallSeqnoToTimeMappingInSV(&sv_contexts);
       } else {
         // One of these will fail
         assert(seqno > 1);

@@ -6501,7 +6507,31 @@ void DBImpl::RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds) {
     // FIXME: assert(seqno > 0);
     // Always successful assuming seqno never go backwards
     seqno_to_time_mapping_.Append(seqno, unix_time);
+    InstallSeqnoToTimeMappingInSV(&sv_contexts);
   }
+
+  // clean up outside db mutex
+  for (SuperVersionContext& sv_context : sv_contexts) {
+    sv_context.Clean();
+  }
 }
 
+void DBImpl::InstallSeqnoToTimeMappingInSV(
+    std::vector<SuperVersionContext>* sv_contexts) {
+  mutex_.AssertHeld();
+  std::shared_ptr<SeqnoToTimeMapping> new_seqno_to_time_mapping =
+      std::make_shared<SeqnoToTimeMapping>();
+  new_seqno_to_time_mapping->CopyFrom(seqno_to_time_mapping_);
+  for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
+    if (cfd->IsDropped()) {
+      continue;
+    }
+    sv_contexts->emplace_back(/*create_superversion=*/true);
+    sv_contexts->back().new_seqno_to_time_mapping = new_seqno_to_time_mapping;
+    cfd->InstallSuperVersion(&sv_contexts->back(),
+                             *(cfd->GetLatestMutableCFOptions()));
+  }
+  bg_cv_.SignalAll();
+}
+
 } // namespace ROCKSDB_NAMESPACE
@@ -1226,6 +1226,22 @@ class DBImpl : public DB {
   // populate_historical_seconds, now].
   void RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds);
 
+  // Everytime DB's seqno to time mapping changed (which already hold the db
+  // mutex), we install a new SuperVersion in each column family with a shared
+  // copy of the new mapping while holding the db mutex.
+  // This is done for all column families even though the column family does not
+  // explicitly enabled the
+  // `preclude_last_level_data_seconds` or `preserve_internal_time_seconds`
+  // features.
+  // This mapping supports iterators to fulfill the
+  // "rocksdb.iterator.write-time" iterator property for entries in memtables.
+  //
+  // Since this new SuperVersion doesn't involve an LSM tree shape change, we
+  // don't schedule work after installing this SuperVersion. It returns the used
+  // `SuperVersionContext` for clean up after release mutex.
+  void InstallSeqnoToTimeMappingInSV(
+      std::vector<SuperVersionContext>* sv_contexts);
+
   // Interface to block and signal the DB in case of stalling writes by
   // WriteBufferManager. Each DBImpl object contains ptr to WBMStallInterface.
   // When DB needs to be blocked or signalled by WriteBufferManager,
@@ -1630,7 +1630,8 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
   Status s;
   TableProperties table_properties;
   {
-    ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
+    ScopedArenaIterator iter(
+        mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
     ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
                     "[%s] [WriteLevel0TableForRecovery]"
                     " Level-0 table #%" PRIu64 ": started",
@@ -113,8 +113,8 @@ Status DBIter::GetProperty(std::string prop_name, std::string* prop) {
     *prop = saved_key_.GetUserKey().ToString();
     return Status::OK();
   } else if (prop_name == "rocksdb.iterator.write-time") {
-    // TODO(yuzhangyu): implement return the actual write time.
-    return Status::NotSupported("write time property is under construction");
+    PutFixed64(prop, saved_write_unix_time_);
+    return Status::OK();
   }
   return Status::InvalidArgument("Unidentified property.");
 }

@@ -421,6 +421,7 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
           assert(ikey_.type == kTypeValue ||
                  ikey_.type == kTypeValuePreferredSeqno);
           Slice value = iter_.value();
+          saved_write_unix_time_ = iter_.write_unix_time();
           if (ikey_.type == kTypeValuePreferredSeqno) {
             value = ParsePackedValueForValue(value);
           }

@@ -582,6 +583,7 @@ bool DBIter::MergeValuesNewToOld() {
 
     if (kTypeValue == ikey.type || kTypeValuePreferredSeqno == ikey.type) {
       Slice value = iter_.value();
+      saved_write_unix_time_ = iter_.write_unix_time();
      if (kTypeValuePreferredSeqno == ikey.type) {
        value = ParsePackedValueForValue(value);
      }

@@ -931,6 +933,7 @@ bool DBIter::FindValueForCurrentKey() {
       case kTypeBlobIndex:
       case kTypeWideColumnEntity:
         if (iter_.iter()->IsValuePinned()) {
+          saved_write_unix_time_ = iter_.write_unix_time();
           if (last_key_entry_type == kTypeValuePreferredSeqno) {
             pinned_value_ = ParsePackedValueForValue(iter_.value());
           } else {

@@ -1162,6 +1165,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
   if (ikey.type == kTypeValue || ikey.type == kTypeValuePreferredSeqno ||
       ikey.type == kTypeBlobIndex || ikey.type == kTypeWideColumnEntity) {
     assert(iter_.iter()->IsValuePinned());
+    saved_write_unix_time_ = iter_.write_unix_time();
     if (ikey.type == kTypeValuePreferredSeqno) {
       pinned_value_ = ParsePackedValueForValue(iter_.value());
     } else {
@@ -367,6 +367,12 @@ class DBIter final : public Iterator {
   // and should not be used across functions. Reusing this object can reduce
   // overhead of calling construction of the function if creating it each time.
   ParsedInternalKey ikey_;
+
+  // TODO(yuzhangyu): update this documentation for kTypeValuePreferredSeqno
+  // types.
+  // The approximate write time for the entry. It is deduced from the entry's
+  // sequence number if the seqno to time mapping is available.
+  uint64_t saved_write_unix_time_;
   std::string saved_value_;
   Slice pinned_value_;
   // for prefix seek mode to support prev()
@@ -142,6 +142,13 @@ TEST_P(DBIteratorTest, IteratorProperty) {
     // Get internal key at which the iteration stopped (tombstone in this case).
     ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value));
     ASSERT_EQ("2", prop_value);
+
+    prop_value.clear();
+    ASSERT_OK(iter->GetProperty("rocksdb.iterator.write-time", &prop_value));
+    uint64_t write_time;
+    Slice prop_slice = prop_value;
+    ASSERT_TRUE(GetFixed64(&prop_slice, &write_time));
+    ASSERT_EQ(std::numeric_limits<uint64_t>::max(), write_time);
   }
   Close();
 }
@@ -374,6 +374,13 @@ inline ValueType ExtractValueType(const Slice& internal_key) {
   return static_cast<ValueType>(c);
 }
 
+// input [internal key]: <user_provided_key | ts | seqno + type>
+// output: <seqno>
+inline SequenceNumber ExtractSequenceNumber(const Slice& internal_key) {
+  uint64_t num = ExtractInternalKeyFooter(internal_key);
+  return num >> 8;
+}
+
 // A comparator for internal keys that uses a specified comparator for
 // the user key portion and breaks ties by decreasing sequence number.
 class InternalKeyComparator
@@ -417,7 +417,8 @@ Status FlushJob::MemPurge() {
   std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
       range_del_iters;
   for (MemTable* m : mems_) {
-    memtables.push_back(m->NewIterator(ro, &arena));
+    memtables.push_back(
+        m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
     auto* range_del_iter = m->NewRangeTombstoneIterator(
         ro, kMaxSequenceNumber, true /* immutable_memtable */);
     if (range_del_iter != nullptr) {

@@ -897,7 +898,8 @@ Status FlushJob::WriteLevel0Table() {
         db_options_.info_log,
         "[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n",
         cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber());
-    memtables.push_back(m->NewIterator(ro, &arena));
+    memtables.push_back(
+        m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
     auto* range_del_iter = m->NewRangeTombstoneIterator(
         ro, kMaxSequenceNumber, true /* immutable_memtable */);
     if (range_del_iter != nullptr) {
@@ -611,6 +611,11 @@ Slice ForwardIterator::key() const {
   return current_->key();
 }
 
+uint64_t ForwardIterator::write_unix_time() const {
+  assert(valid_);
+  return current_->write_unix_time();
+}
+
 Slice ForwardIterator::value() const {
   assert(valid_);
   return current_->value();

@@ -704,8 +709,12 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) {
   }
   ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
                                        kMaxSequenceNumber /* upper_bound */);
-  mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_);
-  sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_);
+  UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping =
+      sv_->GetSeqnoToTimeMapping();
+  mutable_iter_ =
+      sv_->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_);
+  sv_->imm->AddIterators(read_options_, seqno_to_time_mapping, &imm_iters_,
+                         &arena_);
   if (!read_options_.ignore_range_deletions) {
     std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
         sv_->mem->NewRangeTombstoneIterator(

@@ -769,8 +778,12 @@ void ForwardIterator::RenewIterators() {
   }
   imm_iters_.clear();
 
-  mutable_iter_ = svnew->mem->NewIterator(read_options_, &arena_);
-  svnew->imm->AddIterators(read_options_, &imm_iters_, &arena_);
+  UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping =
+      svnew->GetSeqnoToTimeMapping();
+  mutable_iter_ =
+      svnew->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_);
+  svnew->imm->AddIterators(read_options_, seqno_to_time_mapping, &imm_iters_,
+                           &arena_);
   ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
                                        kMaxSequenceNumber /* upper_bound */);
   if (!read_options_.ignore_range_deletions) {
@@ -76,6 +76,7 @@ class ForwardIterator : public InternalIterator {
   void Next() override;
   Slice key() const override;
   Slice value() const override;
+  uint64_t write_unix_time() const override;
   Status status() const override;
   bool PrepareValue() override;
   Status GetProperty(std::string prop_name, std::string* prop) override;
@@ -35,6 +35,12 @@ struct SuperVersionContext {
   std::unique_ptr<SuperVersion>
       new_superversion;  // if nullptr no new superversion
 
+  // If not nullptr, a new seqno to time mapping is available to be installed.
+  // Otherwise, make a shared copy of the one in the existing SuperVersion and
+  // carry it over to the new SuperVersion. This is moved to the SuperVersion
+  // during installation.
+  std::shared_ptr<const SeqnoToTimeMapping> new_seqno_to_time_mapping{nullptr};
+
   explicit SuperVersionContext(bool create_superversion = false)
       : new_superversion(create_superversion ? new SuperVersion() : nullptr) {}
 
@@ -364,11 +364,13 @@ const char* EncodeKey(std::string* scratch, const Slice& target) {
 class MemTableIterator : public InternalIterator {
  public:
   MemTableIterator(const MemTable& mem, const ReadOptions& read_options,
+                   UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
                    Arena* arena, bool use_range_del_table = false)
       : bloom_(nullptr),
         prefix_extractor_(mem.prefix_extractor_),
         comparator_(mem.comparator_),
         valid_(false),
+        seqno_to_time_mapping_(seqno_to_time_mapping),
         arena_mode_(arena != nullptr),
         value_pinned_(
             !mem.GetImmutableMemTableOptions()->inplace_update_support),

@@ -499,6 +501,18 @@ class MemTableIterator : public InternalIterator {
     assert(Valid());
     return GetLengthPrefixedSlice(iter_->key());
   }
+
+  uint64_t write_unix_time() const override {
+    assert(Valid());
+    // TODO(yuzhangyu): if value type is kTypeValuePreferredSeqno,
+    // parse its unix write time out of packed value.
+    if (!seqno_to_time_mapping_ || seqno_to_time_mapping_->Empty()) {
+      return std::numeric_limits<uint64_t>::max();
+    }
+    SequenceNumber seqno = ExtractSequenceNumber(key());
+    return seqno_to_time_mapping_->GetProximalTimeBeforeSeqno(seqno);
+  }
+
   Slice value() const override {
     assert(Valid());
     Slice key_slice = GetLengthPrefixedSlice(iter_->key());

@@ -523,6 +537,8 @@ class MemTableIterator : public InternalIterator {
   const MemTable::KeyComparator comparator_;
   MemTableRep::Iterator* iter_;
   bool valid_;
+  // The seqno to time mapping is owned by the SuperVersion.
+  UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping_;
   bool arena_mode_;
   bool value_pinned_;
   uint32_t protection_bytes_per_key_;

@@ -541,11 +557,13 @@ class MemTableIterator : public InternalIterator {
   }
 };
 
-InternalIterator* MemTable::NewIterator(const ReadOptions& read_options,
-                                        Arena* arena) {
+InternalIterator* MemTable::NewIterator(
+    const ReadOptions& read_options,
+    UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena) {
   assert(arena != nullptr);
   auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
-  return new (mem) MemTableIterator(*this, read_options, arena);
+  return new (mem)
+      MemTableIterator(*this, read_options, seqno_to_time_mapping, arena);
 }
 
 FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator(

@@ -579,9 +597,9 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal(
   if (!cache->initialized.load(std::memory_order_acquire)) {
     cache->reader_mutex.lock();
     if (!cache->tombstones) {
-      auto* unfragmented_iter =
-          new MemTableIterator(*this, read_options, nullptr /* arena */,
-                               true /* use_range_del_table */);
+      auto* unfragmented_iter = new MemTableIterator(
+          *this, read_options, nullptr /* seqno_to_time_mapping= */,
+          nullptr /* arena */, true /* use_range_del_table */);
       cache->tombstones.reset(new FragmentedRangeTombstoneList(
           std::unique_ptr<InternalIterator>(unfragmented_iter),
           comparator_.comparator));

@@ -600,9 +618,9 @@ void MemTable::ConstructFragmentedRangeTombstones() {
   // There should be no concurrent Construction
   if (!is_range_del_table_empty_.load(std::memory_order_relaxed)) {
     // TODO: plumb Env::IOActivity, Env::IOPriority
-    auto* unfragmented_iter =
-        new MemTableIterator(*this, ReadOptions(), nullptr /* arena */,
-                             true /* use_range_del_table */);
+    auto* unfragmented_iter = new MemTableIterator(
+        *this, ReadOptions(), nullptr /*seqno_to_time_mapping=*/,
+        nullptr /* arena */, true /* use_range_del_table */);
 
     fragmented_range_tombstone_list_ =
         std::make_unique<FragmentedRangeTombstoneList>(
@@ -20,6 +20,7 @@
 #include "db/kv_checksum.h"
 #include "db/range_tombstone_fragmenter.h"
 #include "db/read_callback.h"
+#include "db/seqno_to_time_mapping.h"
 #include "db/version_edit.h"
 #include "memory/allocator.h"
 #include "memory/concurrent_arena.h"

@@ -28,6 +29,7 @@
 #include "rocksdb/db.h"
 #include "rocksdb/memtablerep.h"
 #include "table/multiget_context.h"
+#include "util/cast_util.h"
 #include "util/dynamic_bloom.h"
 #include "util/hash.h"
 #include "util/hash_containers.h"

@@ -203,7 +205,11 @@ class MemTable {
   // arena: If not null, the arena needs to be used to allocate the Iterator.
   //        Calling ~Iterator of the iterator will destroy all the states but
   //        those allocated in arena.
-  InternalIterator* NewIterator(const ReadOptions& read_options, Arena* arena);
+  // seqno_to_time_mapping: it's used to support return write unix time for the
+  //                        data, currently only needed for iterators serving
+  //                        user reads.
+  InternalIterator* NewIterator(
+      const ReadOptions& read_options,
+      UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena);
 
   // Returns an iterator that yields the range tombstones of the memtable.
   // The caller must ensure that the underlying MemTable remains live
@@ -211,18 +211,22 @@ Status MemTableListVersion::AddRangeTombstoneIterators(
 }
 
 void MemTableListVersion::AddIterators(
-    const ReadOptions& options, std::vector<InternalIterator*>* iterator_list,
-    Arena* arena) {
+    const ReadOptions& options,
+    UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
+    std::vector<InternalIterator*>* iterator_list, Arena* arena) {
   for (auto& m : memlist_) {
-    iterator_list->push_back(m->NewIterator(options, arena));
+    iterator_list->push_back(
+        m->NewIterator(options, seqno_to_time_mapping, arena));
   }
 }
 
-void MemTableListVersion::AddIterators(const ReadOptions& options,
-                                       MergeIteratorBuilder* merge_iter_builder,
-                                       bool add_range_tombstone_iter) {
+void MemTableListVersion::AddIterators(
+    const ReadOptions& options,
+    UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
+    MergeIteratorBuilder* merge_iter_builder, bool add_range_tombstone_iter) {
   for (auto& m : memlist_) {
-    auto mem_iter = m->NewIterator(options, merge_iter_builder->GetArena());
+    auto mem_iter = m->NewIterator(options, seqno_to_time_mapping,
+                                   merge_iter_builder->GetArena());
     if (!add_range_tombstone_iter || options.ignore_range_deletions) {
       merge_iter_builder->AddIterator(mem_iter);
     } else {
@@ -112,10 +112,12 @@ class MemTableListVersion {
                     RangeDelAggregator* range_del_agg);
 
   void AddIterators(const ReadOptions& options,
+                    UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
                     std::vector<InternalIterator*>* iterator_list,
                     Arena* arena);
 
   void AddIterators(const ReadOptions& options,
+                    UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
                     MergeIteratorBuilder* merge_iter_builder,
                     bool add_range_tombstone_iter);
 
@@ -443,7 +443,8 @@ class Repairer {
     ReadOptions ro;
     ro.total_order_seek = true;
     Arena arena;
-    ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
+    ScopedArenaIterator iter(
+        mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
     int64_t _current_time = 0;
     immutable_db_options_.clock->GetCurrentTime(&_current_time)
         .PermitUncheckedError();  // ignore error
@@ -59,7 +59,8 @@ static std::string PrintContents(WriteBatch* b,
     std::unique_ptr<InternalIterator> iter_guard;
     InternalIterator* iter;
     if (i == 0) {
-      iter = mem->NewIterator(ReadOptions(), &arena);
+      iter = mem->NewIterator(ReadOptions(), /*seqno_to_time_mapping=*/nullptr,
+                              &arena);
       arena_iter_guard.set(iter);
     } else {
       iter = mem->NewRangeTombstoneIterator(ReadOptions(),
@@ -137,15 +137,18 @@ class Iterator : public Cleanable {
   // Get the user-key portion of the internal key at which the iteration
   // stopped.
   // Property "rocksdb.iterator.write-time":
-  // DO NOT USE, UNDER CONSTRUCTION
   // Get the unix time of the best estimate of the write time of the entry.
   // Returned as 64-bit raw value (8 bytes). It can be converted to uint64_t
   // with util method `DecodeU64Ts`. The accuracy of the write time depends on
-  // settings like preserve_internal_time_seconds. If this feature is
-  // disabled, this property will always be empty. The actual write time of
+  // settings like preserve_internal_time_seconds. The actual write time of
   // the entry should be the same or newer than the returned write time. So
   // this property can be interpreted as the possible oldest write time for
   // the entry.
+  // If the seqno to time mapping recording is not enabled,
+  // std::numeric_limits<uint64_t>::max() will be returned to indicate the
+  // write time is unknown. For data entry whose sequence number has
+  // been zeroed out (possible when they reach the last level), 0 is returned
+  // no matter whether the seqno to time recording feature is enabled or not.
   virtual Status GetProperty(std::string prop_name, std::string* prop);
 
   virtual Slice timestamp() const {
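To make the sentinel values documented in the comment block above concrete, here is a small, hypothetical helper sketch. Only the property name and the meanings of the max-uint64 and 0 results come from this hunk; the enum and function names are made up for illustration, and `GetFixed64` is the internal decoding helper used by this PR's unit tests rather than a public API.

```cpp
#include <cstdint>
#include <limits>
#include <string>

#include "rocksdb/iterator.h"
#include "util/coding.h"  // GetFixed64 (internal helper, as in this PR's tests)

// Hypothetical classification of "rocksdb.iterator.write-time" results.
enum class WriteTimeKind { kUnknown, kZeroedSeqno, kApproximate };

WriteTimeKind ClassifyWriteTime(ROCKSDB_NAMESPACE::Iterator* iter,
                                uint64_t* time_out) {
  std::string prop;
  if (!iter->GetProperty("rocksdb.iterator.write-time", &prop).ok()) {
    return WriteTimeKind::kUnknown;
  }
  ROCKSDB_NAMESPACE::Slice prop_slice = prop;
  uint64_t t = 0;
  if (!ROCKSDB_NAMESPACE::GetFixed64(&prop_slice, &t)) {
    return WriteTimeKind::kUnknown;
  }
  if (t == std::numeric_limits<uint64_t>::max()) {
    return WriteTimeKind::kUnknown;  // seqno-to-time recording not available
  }
  if (t == 0) {
    return WriteTimeKind::kZeroedSeqno;  // seqno zeroed out on the last level
  }
  *time_out = t;  // oldest possible unix write time for the entry
  return WriteTimeKind::kApproximate;
}
```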
@@ -60,7 +60,8 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(JNIEnv* env,
   unsigned int count = 0;
   ROCKSDB_NAMESPACE::Arena arena;
   ROCKSDB_NAMESPACE::ScopedArenaIterator iter(
-      mem->NewIterator(ROCKSDB_NAMESPACE::ReadOptions(), &arena));
+      mem->NewIterator(ROCKSDB_NAMESPACE::ReadOptions(),
+                       /*seqno_to_time_mapping=*/nullptr, &arena));
   for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
     ROCKSDB_NAMESPACE::ParsedInternalKey ikey;
     ikey.clear();
@@ -9,6 +9,7 @@
 #pragma once
 #include <deque>
 
+#include "db/seqno_to_time_mapping.h"
 #include "table/block_based/block_based_table_reader.h"
 #include "table/block_based/block_based_table_reader_impl.h"
 #include "table/block_based/block_prefetcher.h"

@@ -92,6 +93,22 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
     return const_cast<BlockBasedTableIterator*>(this)
         ->MaterializeCurrentBlock();
   }
+
+  uint64_t write_unix_time() const override {
+    assert(Valid());
+    // TODO(yuzhangyu): if value type is kTypeValuePreferredSeqno,
+    // parse its unix write time out of packed value.
+    const SeqnoToTimeMapping& seqno_to_time_mapping =
+        table_->GetSeqnoToTimeMapping();
+    SequenceNumber seqno = ExtractSequenceNumber(key());
+    if (kUnknownSeqnoBeforeAll == seqno) {
+      return kUnknownTimeBeforeAll;
+    } else if (seqno_to_time_mapping.Empty()) {
+      return std::numeric_limits<uint64_t>::max();
+    }
+    return seqno_to_time_mapping.GetProximalTimeBeforeSeqno(seqno);
+  }
+
   Slice value() const override {
     // PrepareValue() must have been called.
     assert(!is_at_first_key_from_index_);
@@ -921,6 +921,17 @@ Status BlockBasedTable::ReadPropertiesBlock(
   } else {
     assert(table_properties != nullptr);
     rep_->table_properties = std::move(table_properties);
+
+    if (s.ok()) {
+      s = rep_->seqno_to_time_mapping.DecodeFrom(
+          rep_->table_properties->seqno_to_time_mapping);
+    }
+    if (!s.ok()) {
+      ROCKS_LOG_WARN(
+          rep_->ioptions.logger,
+          "Problem reading or processing seqno-to-time mapping: %s",
+          s.ToString().c_str());
+    }
     rep_->blocks_maybe_compressed =
         rep_->table_properties->compression_name !=
         CompressionTypeToString(kNoCompression);

@@ -1233,6 +1244,10 @@ std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
   return rep_->table_properties;
 }
 
+const SeqnoToTimeMapping& BlockBasedTable::GetSeqnoToTimeMapping() const {
+  return rep_->seqno_to_time_mapping;
+}
+
 size_t BlockBasedTable::ApproximateMemoryUsage() const {
   size_t usage = 0;
   if (rep_) {
@@ -16,6 +16,7 @@
 #include "cache/cache_key.h"
 #include "cache/cache_reservation_manager.h"
 #include "db/range_tombstone_fragmenter.h"
+#include "db/seqno_to_time_mapping.h"
 #include "file/filename.h"
 #include "rocksdb/slice_transform.h"
 #include "rocksdb/table_properties.h"

@@ -197,6 +198,8 @@ class BlockBasedTable : public TableReader {
 
   std::shared_ptr<const TableProperties> GetTableProperties() const override;
 
+  const SeqnoToTimeMapping& GetSeqnoToTimeMapping() const;
+
   size_t ApproximateMemoryUsage() const override;
 
   // convert SST file to a human readable form

@@ -607,6 +610,7 @@ struct BlockBasedTable::Rep {
   BlockHandle compression_dict_handle;
 
   std::shared_ptr<const TableProperties> table_properties;
+  SeqnoToTimeMapping seqno_to_time_mapping;
   BlockHandle index_handle;
   BlockBasedTableOptions::IndexType index_type;
   bool whole_key_filtering;
@@ -116,6 +116,14 @@ class InternalIteratorBase : public Cleanable {
   // REQUIRES: Valid()
   virtual Slice key() const = 0;
 
+  // Returns the approximate write time of this entry, which is deduced from
+  // sequence number if sequence number to time mapping is available.
+  // The default implementation returns maximum uint64_t and that indicates the
+  // write time is unknown.
+  virtual uint64_t write_unix_time() const {
+    return std::numeric_limits<uint64_t>::max();
+  }
+
   // Return user key for the current entry.
   // REQUIRES: Valid()
   virtual Slice user_key() const { return ExtractUserKey(key()); }
@@ -82,6 +82,12 @@ class IteratorWrapperBase {
     assert(Valid());
     return result_.key;
   }
+
+  uint64_t write_unix_time() const {
+    assert(Valid());
+    return iter_->write_unix_time();
+  }
+
   TValue value() const {
     assert(Valid());
     return iter_->value();
@@ -430,6 +430,11 @@ class MergingIterator : public InternalIterator {
     return current_->key();
   }
 
+  uint64_t write_unix_time() const override {
+    assert(Valid());
+    return current_->write_unix_time();
+  }
+
   Slice value() const override {
     assert(Valid());
     return current_->value();
@@ -534,7 +534,9 @@ class MemTableConstructor : public Constructor {
   InternalIterator* NewIterator(
       const SliceTransform* /*prefix_extractor*/) const override {
     return new KeyConvertingIterator(
-        memtable_->NewIterator(ReadOptions(), &arena_), true);
+        memtable_->NewIterator(ReadOptions(), /*seqno_to_time_mapping=*/nullptr,
+                               &arena_),
+        true);
   }
 
   bool AnywayDeleteIterator() const override { return true; }

@@ -4897,7 +4899,8 @@ TEST_F(MemTableTest, Simple) {
     std::unique_ptr<InternalIterator> iter_guard;
     InternalIterator* iter;
     if (i == 0) {
-      iter = GetMemTable()->NewIterator(ReadOptions(), &arena);
+      iter = GetMemTable()->NewIterator(
+          ReadOptions(), /*seqno_to_time_mapping=*/nullptr, &arena);
       arena_iter_guard.set(iter);
     } else {
       iter = GetMemTable()->NewRangeTombstoneIterator(