Support returning write unix time in iterator property (#12428)

Summary:
This PR adds support to return data's approximate unix write time in the iterator property API. The general implementation is:
1) If the entry comes from an SST file, the sequence number to time mapping recorded in that file's table properties will be used to deduce the entry's write time from its sequence number. If no such recording is available, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown, except if the entry's sequence number is zero, in which case 0 is returned. This also means that even though `preclude_last_level_data_seconds` and `preserve_internal_time_seconds` can be toggled off between DB reopens, as long as the SST file's table properties have the mapping available, the entry's write time can be deduced and returned.

2) If the entry comes from a memtable, we will use the DB's sequence number to write time mapping to do similar things. A copy of the DB's seqno to write time mapping is kept in SuperVersion to allow iterators to have lock-free access. This also means a new `SuperVersion` is installed each time the DB's seqno to time mapping updates, which was originally proposed by Peter in https://github.com/facebook/rocksdb/issues/11928 . Similarly, if the feature is not enabled, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown.

Needed follow up:
1) The write time for `kTypeValuePreferredSeqno` should be special cased, where it's already specified by the user, so we can directly return it.

2) Flush job can be updated to use DB's seqno to time mapping copy in the SuperVersion.

3) Handle the case when `TimedPut` is called with a write time that is `std::numeric_limits<uint64_t>::max()`. We can make it a regular `Put`.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12428

Test Plan: Added unit test

Reviewed By: pdillinger

Differential Revision: D54967067

Pulled By: jowlyzhang

fbshipit-source-id: c795b1b7ec142e09e53f2ed3461cf719833cb37a
This commit is contained in:
Yu Zhang 2024-03-15 15:37:37 -07:00 committed by Facebook GitHub Bot
parent 4d5ebad971
commit f2546b6623
29 changed files with 510 additions and 47 deletions

View file

@ -477,13 +477,16 @@ void SuperVersion::Cleanup() {
cfd->UnrefAndTryDelete(); cfd->UnrefAndTryDelete();
} }
void SuperVersion::Init(ColumnFamilyData* new_cfd, MemTable* new_mem, void SuperVersion::Init(
MemTableListVersion* new_imm, Version* new_current) { ColumnFamilyData* new_cfd, MemTable* new_mem, MemTableListVersion* new_imm,
Version* new_current,
std::shared_ptr<const SeqnoToTimeMapping> new_seqno_to_time_mapping) {
cfd = new_cfd; cfd = new_cfd;
mem = new_mem; mem = new_mem;
imm = new_imm; imm = new_imm;
current = new_current; current = new_current;
full_history_ts_low = cfd->GetFullHistoryTsLow(); full_history_ts_low = cfd->GetFullHistoryTsLow();
seqno_to_time_mapping = std::move(new_seqno_to_time_mapping);
cfd->Ref(); cfd->Ref();
mem->Ref(); mem->Ref();
imm->Ref(); imm->Ref();
@ -1196,9 +1199,10 @@ Status ColumnFamilyData::RangesOverlapWithMemtables(
ReadOptions read_opts; ReadOptions read_opts;
read_opts.total_order_seek = true; read_opts.total_order_seek = true;
MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena); MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena);
merge_iter_builder.AddIterator( merge_iter_builder.AddIterator(super_version->mem->NewIterator(
super_version->mem->NewIterator(read_opts, &arena)); read_opts, /*seqno_to_time_mapping=*/nullptr, &arena));
super_version->imm->AddIterators(read_opts, &merge_iter_builder, super_version->imm->AddIterators(read_opts, /*seqno_to_time_mapping=*/nullptr,
&merge_iter_builder,
false /* add_range_tombstone_iter */); false /* add_range_tombstone_iter */);
ScopedArenaIterator memtable_iter(merge_iter_builder.Finish()); ScopedArenaIterator memtable_iter(merge_iter_builder.Finish());
@ -1336,7 +1340,12 @@ void ColumnFamilyData::InstallSuperVersion(
const MutableCFOptions& mutable_cf_options) { const MutableCFOptions& mutable_cf_options) {
SuperVersion* new_superversion = sv_context->new_superversion.release(); SuperVersion* new_superversion = sv_context->new_superversion.release();
new_superversion->mutable_cf_options = mutable_cf_options; new_superversion->mutable_cf_options = mutable_cf_options;
new_superversion->Init(this, mem_, imm_.current(), current_); new_superversion->Init(this, mem_, imm_.current(), current_,
sv_context->new_seqno_to_time_mapping
? std::move(sv_context->new_seqno_to_time_mapping)
: super_version_
? super_version_->ShareSeqnoToTimeMapping()
: nullptr);
SuperVersion* old_superversion = super_version_; SuperVersion* old_superversion = super_version_;
super_version_ = new_superversion; super_version_ = new_superversion;
if (old_superversion == nullptr || old_superversion->current != current() || if (old_superversion == nullptr || old_superversion->current != current() ||

View file

@ -26,6 +26,7 @@
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "rocksdb/options.h" #include "rocksdb/options.h"
#include "trace_replay/block_cache_tracer.h" #include "trace_replay/block_cache_tracer.h"
#include "util/cast_util.h"
#include "util/hash_containers.h" #include "util/hash_containers.h"
#include "util/thread_local.h" #include "util/thread_local.h"
@ -219,6 +220,9 @@ struct SuperVersion {
// enable UDT feature, this is an empty string. // enable UDT feature, this is an empty string.
std::string full_history_ts_low; std::string full_history_ts_low;
// A shared copy of the DB's seqno to time mapping.
std::shared_ptr<const SeqnoToTimeMapping> seqno_to_time_mapping{nullptr};
// should be called outside the mutex // should be called outside the mutex
SuperVersion() = default; SuperVersion() = default;
~SuperVersion(); ~SuperVersion();
@ -232,8 +236,23 @@ struct SuperVersion {
// that needs to be deleted in to_delete vector. Unrefing those // that needs to be deleted in to_delete vector. Unrefing those
// objects needs to be done in the mutex // objects needs to be done in the mutex
void Cleanup(); void Cleanup();
void Init(ColumnFamilyData* new_cfd, MemTable* new_mem, void Init(
MemTableListVersion* new_imm, Version* new_current); ColumnFamilyData* new_cfd, MemTable* new_mem,
MemTableListVersion* new_imm, Version* new_current,
std::shared_ptr<const SeqnoToTimeMapping> new_seqno_to_time_mapping);
// Hands out another owning reference to the seqno to time mapping held by
// this SuperVersion. Intended for the SuperVersion installed right after this
// one when the mapping did not change in between, so both can refer to the
// same object.
std::shared_ptr<const SeqnoToTimeMapping> ShareSeqnoToTimeMapping() {
  std::shared_ptr<const SeqnoToTimeMapping> shared_mapping(
      seqno_to_time_mapping);
  return shared_mapping;
}
// Non-owning view of the seqno to time mapping held by this SuperVersion.
// The result wraps a null pointer when no mapping is set.
UnownedPtr<const SeqnoToTimeMapping> GetSeqnoToTimeMapping() const {
  const SeqnoToTimeMapping* raw_mapping = seqno_to_time_mapping.get();
  return UnownedPtr<const SeqnoToTimeMapping>(raw_mapping);
}
// The value of dummy is not actually used. kSVInUse takes its address as a // The value of dummy is not actually used. kSVInUse takes its address as a
// mark in the thread local storage to indicate the SuperVersion is in use // mark in the thread local storage to indicate the SuperVersion is in use

View file

@ -14,6 +14,7 @@
#include "rocksdb/listener.h" #include "rocksdb/listener.h"
#include "rocksdb/utilities/debug.h" #include "rocksdb/utilities/debug.h"
#include "test_util/mock_time_env.h" #include "test_util/mock_time_env.h"
#include "utilities/merge_operators.h"
namespace ROCKSDB_NAMESPACE { namespace ROCKSDB_NAMESPACE {
@ -1307,8 +1308,8 @@ TEST_F(TieredCompactionTest, CheckInternalKeyRange) {
class PrecludeLastLevelTest : public DBTestBase { class PrecludeLastLevelTest : public DBTestBase {
public: public:
PrecludeLastLevelTest() PrecludeLastLevelTest(std::string test_name = "preclude_last_level_test")
: DBTestBase("preclude_last_level_test", /*env_do_fsync=*/false) { : DBTestBase(test_name, /*env_do_fsync=*/false) {
mock_clock_ = std::make_shared<MockSystemClock>(env_->GetSystemClock()); mock_clock_ = std::make_shared<MockSystemClock>(env_->GetSystemClock());
mock_clock_->SetCurrentTime(kMockStartTime); mock_clock_->SetCurrentTime(kMockStartTime);
mock_env_ = std::make_unique<CompositeEnvWrapper>(env_, mock_clock_); mock_env_ = std::make_unique<CompositeEnvWrapper>(env_, mock_clock_);
@ -2256,6 +2257,253 @@ TEST_F(PrecludeLastLevelTest, RangeDelsCauseFileEndpointsToOverlap) {
Close(); Close();
} }
// Tests that DBIter::GetProperty("rocksdb.iterator.write-time") returns an
// entry's approximate unix write time.
// Test Param:
// 1) use tailing iterator or regular iterator (when it applies)
class IteratorWriteTimeTest : public PrecludeLastLevelTest,
                              public testing::WithParamInterface<bool> {
 public:
  IteratorWriteTimeTest() : PrecludeLastLevelTest("iterator_write_time_test") {}

  // Expects `iter` to be positioned on `expected_key`, reads the
  // "rocksdb.iterator.write-time" property and returns it decoded as a fixed
  // 64-bit integer.
  // Returns 0 (with a test failure already recorded) when `iter` is not valid,
  // so that key() and GetProperty() are never called on an invalid iterator.
  // 0 is also returned when the property cannot be decoded.
  uint64_t VerifyKeyAndGetWriteTime(Iterator* iter,
                                    const std::string& expected_key) {
    EXPECT_TRUE(iter->Valid());
    if (!iter->Valid()) {
      return 0;
    }
    std::string prop;
    uint64_t write_time = 0;
    EXPECT_EQ(expected_key, iter->key());
    EXPECT_OK(iter->GetProperty("rocksdb.iterator.write-time", &prop));
    Slice prop_slice = prop;
    EXPECT_TRUE(GetFixed64(&prop_slice, &write_time));
    return write_time;
  }

  // Same checks as VerifyKeyAndGetWriteTime(), and additionally expects the
  // decoded write time to equal `expected_write_time`.
  void VerifyKeyAndWriteTime(Iterator* iter, const std::string& expected_key,
                             uint64_t expected_write_time) {
    EXPECT_EQ(expected_write_time,
              VerifyKeyAndGetWriteTime(iter, expected_key));
  }
};
// Writes kNumKeys entries that all stay in the memtable, advancing the mock
// clock before every write, and checks the write time reported for each entry
// through forward and backward iteration.
TEST_P(IteratorWriteTimeTest, ReadFromMemtables) {
  const int kNumTrigger = 4;
  const int kNumLevels = 7;
  const int kNumKeys = 100;
  const int kSecondsPerRecording = 101;

  Options options = CurrentOptions();
  options.compaction_style = kCompactionStyleUniversal;
  options.env = mock_env_.get();
  options.level0_file_num_compaction_trigger = kNumTrigger;
  options.preserve_internal_time_seconds = 10000;
  options.num_levels = kNumLevels;
  DestroyAndReopen(options);

  Random rnd(301);
  for (int i = 0; i < kNumKeys; i++) {
    dbfull()->TEST_WaitForPeriodicTaskRun(
        [&] { mock_clock_->MockSleepForSeconds(kSecondsPerRecording); });
    ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
  }

  ReadOptions ropts;
  ropts.tailing = GetParam();
  int i;

  // Forward iteration
  uint64_t start_time = 0;
  {
    std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ropts));
    for (iter->SeekToFirst(), i = 0; iter->Valid(); iter->Next(), i++) {
      if (start_time == 0) {
        // The first entry's write time is the reference for all later checks.
        start_time = VerifyKeyAndGetWriteTime(iter.get(), Key(i));
      } else {
        VerifyKeyAndWriteTime(iter.get(), Key(i),
                              start_time + kSecondsPerRecording * (i + 1));
      }
    }
    ASSERT_OK(iter->status());
    // Every key must have been visited; otherwise the checks above could pass
    // vacuously on an empty or truncated iteration.
    ASSERT_EQ(kNumKeys, i);
  }

  // Backward iteration
  {
    // Backward iteration always uses a regular (non-tailing) iterator.
    ropts.tailing = false;
    std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ropts));
    for (iter->SeekToLast(), i = kNumKeys - 1; iter->Valid();
         iter->Prev(), i--) {
      if (i == 0) {
        VerifyKeyAndWriteTime(iter.get(), Key(i), start_time);
      } else {
        VerifyKeyAndWriteTime(iter.get(), Key(i),
                              start_time + kSecondsPerRecording * (i + 1));
      }
    }
    ASSERT_OK(iter->status());
    // The index runs from kNumKeys - 1 down to 0, ending at -1 once every key
    // has been visited.
    ASSERT_EQ(-1, i);
  }
  Close();
}
// Writes kNumKeys entries, flushes them to an SST file and checks the write
// time reported for each entry. Then reopens the DB with seqno to time
// recording disabled and checks that entries read from the existing SST file
// still report their write time, while newly written data does not.
TEST_P(IteratorWriteTimeTest, ReadFromSstFile) {
  const int kNumTrigger = 4;
  const int kNumLevels = 7;
  const int kNumKeys = 100;
  const int kSecondsPerRecording = 101;

  Options options = CurrentOptions();
  options.compaction_style = kCompactionStyleUniversal;
  options.env = mock_env_.get();
  options.level0_file_num_compaction_trigger = kNumTrigger;
  options.preserve_internal_time_seconds = 10000;
  options.num_levels = kNumLevels;
  DestroyAndReopen(options);

  Random rnd(301);
  for (int i = 0; i < kNumKeys; i++) {
    dbfull()->TEST_WaitForPeriodicTaskRun(
        [&] { mock_clock_->MockSleepForSeconds(kSecondsPerRecording); });
    ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
  }

  ASSERT_OK(Flush());
  ReadOptions ropts;
  ropts.tailing = GetParam();
  int i;

  // Forward iteration
  uint64_t start_time = 0;
  {
    std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ropts));
    for (iter->SeekToFirst(), i = 0; iter->Valid(); iter->Next(), i++) {
      if (start_time == 0) {
        // The first entry's write time is the reference for all later checks.
        start_time = VerifyKeyAndGetWriteTime(iter.get(), Key(i));
      } else {
        VerifyKeyAndWriteTime(iter.get(), Key(i),
                              start_time + kSecondsPerRecording * (i + 1));
      }
    }
    ASSERT_OK(iter->status());
    // Guard against the loop passing vacuously.
    ASSERT_EQ(kNumKeys, i);
  }

  // Backward iteration
  {
    // From here on only regular (non-tailing) iterators are used.
    ropts.tailing = false;
    std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ropts));
    for (iter->SeekToLast(), i = kNumKeys - 1; iter->Valid();
         iter->Prev(), i--) {
      if (i == 0) {
        VerifyKeyAndWriteTime(iter.get(), Key(i), start_time);
      } else {
        VerifyKeyAndWriteTime(iter.get(), Key(i),
                              start_time + kSecondsPerRecording * (i + 1));
      }
    }
    ASSERT_OK(iter->status());
    ASSERT_EQ(-1, i);
  }

  // Reopen the DB and disable the seqno to time recording. Data retrieved from
  // SST files still have write time available.
  // The DB must be reopened without being destroyed: destroying it would
  // remove the SST file written above and leave nothing for the checks below
  // to read.
  options.preserve_internal_time_seconds = 0;
  Reopen(options);

  dbfull()->TEST_WaitForPeriodicTaskRun(
      [&] { mock_clock_->MockSleepForSeconds(kSecondsPerRecording); });
  ASSERT_OK(Put("a", "val"));
  ASSERT_TRUE(dbfull()->TEST_GetSeqnoToTimeMapping().Empty());

  {
    std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ropts));
    iter->SeekToFirst();
    ASSERT_TRUE(iter->Valid());
    // "a" is retrieved from memtable, its write time is unknown because the
    // seqno to time mapping recording is not available.
    VerifyKeyAndWriteTime(iter.get(), "a",
                          std::numeric_limits<uint64_t>::max());
    for (iter->Next(), i = 0; iter->Valid(); iter->Next(), i++) {
      if (i == 0) {
        VerifyKeyAndWriteTime(iter.get(), Key(i), start_time);
      } else {
        VerifyKeyAndWriteTime(iter.get(), Key(i),
                              start_time + kSecondsPerRecording * (i + 1));
      }
    }
    ASSERT_OK(iter->status());
    // All entries from the pre-existing SST file must have been checked.
    ASSERT_EQ(kNumKeys, i);
  }

  // There is no write time info for "a" after it's flushed to SST file either.
  ASSERT_OK(Flush());
  {
    std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ropts));
    iter->SeekToFirst();
    ASSERT_TRUE(iter->Valid());
    VerifyKeyAndWriteTime(iter.get(), "a",
                          std::numeric_limits<uint64_t>::max());
  }

  // Sequence number zeroed out after compacted to the last level, write time
  // all becomes zero.
  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
  {
    std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ropts));
    iter->SeekToFirst();
    ASSERT_TRUE(iter->Valid());
    // Skip over "a", which sorts before all Key(i) entries.
    for (iter->Next(), i = 0; iter->Valid(); iter->Next(), i++) {
      VerifyKeyAndWriteTime(iter.get(), Key(i), 0);
    }
    ASSERT_OK(iter->status());
    ASSERT_EQ(kNumKeys, i);
  }
  Close();
}
// A key whose visible value is the result of a merge reports the write time of
// its base value rather than that of the later merge operand.
TEST_P(IteratorWriteTimeTest, MergeReturnsBaseValueWriteTime) {
  const int kNumTrigger = 4;
  const int kNumLevels = 7;
  const int kSecondsPerRecording = 101;

  Options options = CurrentOptions();
  options.env = mock_env_.get();
  options.merge_operator = MergeOperators::CreateStringAppendOperator();
  options.compaction_style = kCompactionStyleUniversal;
  options.num_levels = kNumLevels;
  options.level0_file_num_compaction_trigger = kNumTrigger;
  options.preserve_internal_time_seconds = 10000;
  DestroyAndReopen(options);

  // Advances the mock clock and waits for the periodic task to run.
  auto advance_clock = [&]() {
    dbfull()->TEST_WaitForPeriodicTaskRun(
        [&] { mock_clock_->MockSleepForSeconds(kSecondsPerRecording); });
  };

  // "foo" gets its base value first, "bar" is written one recording later, and
  // only then is "foo" merged.
  advance_clock();
  ASSERT_OK(Put("foo", "fv1"));

  advance_clock();
  ASSERT_OK(Put("bar", "bv1"));
  ASSERT_OK(Merge("foo", "bv1"));

  ReadOptions read_opts;
  read_opts.tailing = GetParam();
  {
    std::unique_ptr<Iterator> it(dbfull()->NewIterator(read_opts));
    it->SeekToFirst();
    const uint64_t bar_time = VerifyKeyAndGetWriteTime(it.get(), "bar");
    it->Next();
    const uint64_t foo_time = VerifyKeyAndGetWriteTime(it.get(), "foo");
    // "foo" has an older write time because its base value's write time is used
    ASSERT_GT(bar_time, foo_time);
    it->Next();
    ASSERT_FALSE(it->Valid());
    ASSERT_OK(it->status());
  }

  Close();
}
// Instantiates every IteratorWriteTimeTest case with both bool values; the
// tests assign the parameter to ReadOptions::tailing.
INSTANTIATE_TEST_CASE_P(IteratorWriteTimeTest, IteratorWriteTimeTest,
testing::Bool());
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) { int main(int argc, char** argv) {

View file

@ -16,6 +16,7 @@
#include <cinttypes> #include <cinttypes>
#include <cstdio> #include <cstdio>
#include <map> #include <map>
#include <memory>
#include <set> #include <set>
#include <sstream> #include <sstream>
#include <stdexcept> #include <stdexcept>
@ -1979,7 +1980,8 @@ InternalIterator* DBImpl::NewInternalIterator(
super_version->mutable_cf_options.prefix_extractor != nullptr, super_version->mutable_cf_options.prefix_extractor != nullptr,
read_options.iterate_upper_bound); read_options.iterate_upper_bound);
// Collect iterator for mutable memtable // Collect iterator for mutable memtable
auto mem_iter = super_version->mem->NewIterator(read_options, arena); auto mem_iter = super_version->mem->NewIterator(
read_options, super_version->GetSeqnoToTimeMapping(), arena);
Status s; Status s;
if (!read_options.ignore_range_deletions) { if (!read_options.ignore_range_deletions) {
TruncatedRangeDelIterator* mem_tombstone_iter = nullptr; TruncatedRangeDelIterator* mem_tombstone_iter = nullptr;
@ -2001,8 +2003,9 @@ InternalIterator* DBImpl::NewInternalIterator(
// Collect all needed child iterators for immutable memtables // Collect all needed child iterators for immutable memtables
if (s.ok()) { if (s.ok()) {
super_version->imm->AddIterators(read_options, &merge_iter_builder, super_version->imm->AddIterators(
!read_options.ignore_range_deletions); read_options, super_version->GetSeqnoToTimeMapping(),
&merge_iter_builder, !read_options.ignore_range_deletions);
} }
TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s); TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s);
if (s.ok()) { if (s.ok()) {
@ -6466,6 +6469,8 @@ void DBImpl::RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds) {
immutable_db_options_.clock->GetCurrentTime(&unix_time_signed) immutable_db_options_.clock->GetCurrentTime(&unix_time_signed)
.PermitUncheckedError(); // Ignore error .PermitUncheckedError(); // Ignore error
uint64_t unix_time = static_cast<uint64_t>(unix_time_signed); uint64_t unix_time = static_cast<uint64_t>(unix_time_signed);
std::vector<SuperVersionContext> sv_contexts;
if (populate_historical_seconds > 0) { if (populate_historical_seconds > 0) {
bool success = true; bool success = true;
{ {
@ -6476,6 +6481,7 @@ void DBImpl::RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds) {
success = seqno_to_time_mapping_.PrePopulate( success = seqno_to_time_mapping_.PrePopulate(
from_seqno, seqno, unix_time - populate_historical_seconds, from_seqno, seqno, unix_time - populate_historical_seconds,
unix_time); unix_time);
InstallSeqnoToTimeMappingInSV(&sv_contexts);
} else { } else {
// One of these will fail // One of these will fail
assert(seqno > 1); assert(seqno > 1);
@ -6501,7 +6507,31 @@ void DBImpl::RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds) {
// FIXME: assert(seqno > 0); // FIXME: assert(seqno > 0);
// Always successful assuming seqno never go backwards // Always successful assuming seqno never go backwards
seqno_to_time_mapping_.Append(seqno, unix_time); seqno_to_time_mapping_.Append(seqno, unix_time);
InstallSeqnoToTimeMappingInSV(&sv_contexts);
}
// clean up outside db mutex
for (SuperVersionContext& sv_context : sv_contexts) {
sv_context.Clean();
} }
} }
void DBImpl::InstallSeqnoToTimeMappingInSV(
std::vector<SuperVersionContext>* sv_contexts) {
mutex_.AssertHeld();
std::shared_ptr<SeqnoToTimeMapping> new_seqno_to_time_mapping =
std::make_shared<SeqnoToTimeMapping>();
new_seqno_to_time_mapping->CopyFrom(seqno_to_time_mapping_);
for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
if (cfd->IsDropped()) {
continue;
}
sv_contexts->emplace_back(/*create_superversion=*/true);
sv_contexts->back().new_seqno_to_time_mapping = new_seqno_to_time_mapping;
cfd->InstallSuperVersion(&sv_contexts->back(),
*(cfd->GetLatestMutableCFOptions()));
}
bg_cv_.SignalAll();
}
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE

View file

@ -1226,6 +1226,22 @@ class DBImpl : public DB {
// populate_historical_seconds, now]. // populate_historical_seconds, now].
void RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds); void RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds);
// Every time the DB's seqno to time mapping changes (which happens with the
// db mutex already held), we install a new SuperVersion in each column family
// that has not been dropped, with a shared copy of the new mapping, while
// still holding the db mutex.
// This is done for all such column families, even those that do not
// explicitly enable the
// `preclude_last_level_data_seconds` or `preserve_internal_time_seconds`
// features.
// This mapping supports iterators to fulfill the
// "rocksdb.iterator.write-time" iterator property for entries in memtables.
//
// Since this new SuperVersion doesn't involve an LSM tree shape change, we
// don't schedule work after installing this SuperVersion. The
// `SuperVersionContext` objects used are appended to `sv_contexts` so the
// caller can clean them up after releasing the mutex.
void InstallSeqnoToTimeMappingInSV(
std::vector<SuperVersionContext>* sv_contexts);
// Interface to block and signal the DB in case of stalling writes by // Interface to block and signal the DB in case of stalling writes by
// WriteBufferManager. Each DBImpl object contains ptr to WBMStallInterface. // WriteBufferManager. Each DBImpl object contains ptr to WBMStallInterface.
// When DB needs to be blocked or signalled by WriteBufferManager, // When DB needs to be blocked or signalled by WriteBufferManager,

View file

@ -1630,7 +1630,8 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
Status s; Status s;
TableProperties table_properties; TableProperties table_properties;
{ {
ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); ScopedArenaIterator iter(
mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
ROCKS_LOG_DEBUG(immutable_db_options_.info_log, ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
"[%s] [WriteLevel0TableForRecovery]" "[%s] [WriteLevel0TableForRecovery]"
" Level-0 table #%" PRIu64 ": started", " Level-0 table #%" PRIu64 ": started",

View file

@ -113,8 +113,8 @@ Status DBIter::GetProperty(std::string prop_name, std::string* prop) {
*prop = saved_key_.GetUserKey().ToString(); *prop = saved_key_.GetUserKey().ToString();
return Status::OK(); return Status::OK();
} else if (prop_name == "rocksdb.iterator.write-time") { } else if (prop_name == "rocksdb.iterator.write-time") {
// TODO(yuzhangyu): implement return the actual write time. PutFixed64(prop, saved_write_unix_time_);
return Status::NotSupported("write time property is under construction"); return Status::OK();
} }
return Status::InvalidArgument("Unidentified property."); return Status::InvalidArgument("Unidentified property.");
} }
@ -421,6 +421,7 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
assert(ikey_.type == kTypeValue || assert(ikey_.type == kTypeValue ||
ikey_.type == kTypeValuePreferredSeqno); ikey_.type == kTypeValuePreferredSeqno);
Slice value = iter_.value(); Slice value = iter_.value();
saved_write_unix_time_ = iter_.write_unix_time();
if (ikey_.type == kTypeValuePreferredSeqno) { if (ikey_.type == kTypeValuePreferredSeqno) {
value = ParsePackedValueForValue(value); value = ParsePackedValueForValue(value);
} }
@ -582,6 +583,7 @@ bool DBIter::MergeValuesNewToOld() {
if (kTypeValue == ikey.type || kTypeValuePreferredSeqno == ikey.type) { if (kTypeValue == ikey.type || kTypeValuePreferredSeqno == ikey.type) {
Slice value = iter_.value(); Slice value = iter_.value();
saved_write_unix_time_ = iter_.write_unix_time();
if (kTypeValuePreferredSeqno == ikey.type) { if (kTypeValuePreferredSeqno == ikey.type) {
value = ParsePackedValueForValue(value); value = ParsePackedValueForValue(value);
} }
@ -931,6 +933,7 @@ bool DBIter::FindValueForCurrentKey() {
case kTypeBlobIndex: case kTypeBlobIndex:
case kTypeWideColumnEntity: case kTypeWideColumnEntity:
if (iter_.iter()->IsValuePinned()) { if (iter_.iter()->IsValuePinned()) {
saved_write_unix_time_ = iter_.write_unix_time();
if (last_key_entry_type == kTypeValuePreferredSeqno) { if (last_key_entry_type == kTypeValuePreferredSeqno) {
pinned_value_ = ParsePackedValueForValue(iter_.value()); pinned_value_ = ParsePackedValueForValue(iter_.value());
} else { } else {
@ -1162,6 +1165,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
if (ikey.type == kTypeValue || ikey.type == kTypeValuePreferredSeqno || if (ikey.type == kTypeValue || ikey.type == kTypeValuePreferredSeqno ||
ikey.type == kTypeBlobIndex || ikey.type == kTypeWideColumnEntity) { ikey.type == kTypeBlobIndex || ikey.type == kTypeWideColumnEntity) {
assert(iter_.iter()->IsValuePinned()); assert(iter_.iter()->IsValuePinned());
saved_write_unix_time_ = iter_.write_unix_time();
if (ikey.type == kTypeValuePreferredSeqno) { if (ikey.type == kTypeValuePreferredSeqno) {
pinned_value_ = ParsePackedValueForValue(iter_.value()); pinned_value_ = ParsePackedValueForValue(iter_.value());
} else { } else {

View file

@ -367,6 +367,12 @@ class DBIter final : public Iterator {
// and should not be used across functions. Reusing this object can reduce // and should not be used across functions. Reusing this object can reduce
// overhead of calling construction of the function if creating it each time. // overhead of calling construction of the function if creating it each time.
ParsedInternalKey ikey_; ParsedInternalKey ikey_;
// TODO(yuzhangyu): update this documentation for kTypeValuePreferredSeqno
// types.
// The approximate unix write time of the entry, copied from the underlying
// iterator's write_unix_time() when the entry's value is located, and
// returned by the "rocksdb.iterator.write-time" property. It is deduced from
// the entry's sequence number if the seqno to time mapping is available.
// NOTE(review): there is no in-class initializer here; presumably the
// constructor initializes it — confirm, since GetProperty() reads it
// unconditionally.
uint64_t saved_write_unix_time_;
std::string saved_value_; std::string saved_value_;
Slice pinned_value_; Slice pinned_value_;
// for prefix seek mode to support prev() // for prefix seek mode to support prev()

View file

@ -142,6 +142,13 @@ TEST_P(DBIteratorTest, IteratorProperty) {
// Get internal key at which the iteration stopped (tombstone in this case). // Get internal key at which the iteration stopped (tombstone in this case).
ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value)); ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value));
ASSERT_EQ("2", prop_value); ASSERT_EQ("2", prop_value);
prop_value.clear();
ASSERT_OK(iter->GetProperty("rocksdb.iterator.write-time", &prop_value));
uint64_t write_time;
Slice prop_slice = prop_value;
ASSERT_TRUE(GetFixed64(&prop_slice, &write_time));
ASSERT_EQ(std::numeric_limits<uint64_t>::max(), write_time);
} }
Close(); Close();
} }

View file

@ -374,6 +374,13 @@ inline ValueType ExtractValueType(const Slice& internal_key) {
return static_cast<ValueType>(c); return static_cast<ValueType>(c);
} }
// input [internal key]: <user_provided_key | ts | seqno + type>
// output: <seqno>
inline SequenceNumber ExtractSequenceNumber(const Slice& internal_key) {
  // The footer holds "seqno + type"; shifting right by 8 bits leaves the
  // sequence number.
  const uint64_t packed_footer = ExtractInternalKeyFooter(internal_key);
  const uint64_t seqno = packed_footer >> 8;
  return seqno;
}
// A comparator for internal keys that uses a specified comparator for // A comparator for internal keys that uses a specified comparator for
// the user key portion and breaks ties by decreasing sequence number. // the user key portion and breaks ties by decreasing sequence number.
class InternalKeyComparator class InternalKeyComparator

View file

@ -417,7 +417,8 @@ Status FlushJob::MemPurge() {
std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>> std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
range_del_iters; range_del_iters;
for (MemTable* m : mems_) { for (MemTable* m : mems_) {
memtables.push_back(m->NewIterator(ro, &arena)); memtables.push_back(
m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
auto* range_del_iter = m->NewRangeTombstoneIterator( auto* range_del_iter = m->NewRangeTombstoneIterator(
ro, kMaxSequenceNumber, true /* immutable_memtable */); ro, kMaxSequenceNumber, true /* immutable_memtable */);
if (range_del_iter != nullptr) { if (range_del_iter != nullptr) {
@ -897,7 +898,8 @@ Status FlushJob::WriteLevel0Table() {
db_options_.info_log, db_options_.info_log,
"[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n", "[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n",
cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber()); cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber());
memtables.push_back(m->NewIterator(ro, &arena)); memtables.push_back(
m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
auto* range_del_iter = m->NewRangeTombstoneIterator( auto* range_del_iter = m->NewRangeTombstoneIterator(
ro, kMaxSequenceNumber, true /* immutable_memtable */); ro, kMaxSequenceNumber, true /* immutable_memtable */);
if (range_del_iter != nullptr) { if (range_del_iter != nullptr) {

View file

@ -611,6 +611,11 @@ Slice ForwardIterator::key() const {
return current_->key(); return current_->key();
} }
// Forwards to the child iterator the ForwardIterator is currently positioned
// on. Must only be called while this iterator is valid.
uint64_t ForwardIterator::write_unix_time() const {
  assert(valid_);
  const uint64_t current_write_time = current_->write_unix_time();
  return current_write_time;
}
Slice ForwardIterator::value() const { Slice ForwardIterator::value() const {
assert(valid_); assert(valid_);
return current_->value(); return current_->value();
@ -704,8 +709,12 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) {
} }
ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(), ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
kMaxSequenceNumber /* upper_bound */); kMaxSequenceNumber /* upper_bound */);
mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_); UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping =
sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_); sv_->GetSeqnoToTimeMapping();
mutable_iter_ =
sv_->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_);
sv_->imm->AddIterators(read_options_, seqno_to_time_mapping, &imm_iters_,
&arena_);
if (!read_options_.ignore_range_deletions) { if (!read_options_.ignore_range_deletions) {
std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter( std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
sv_->mem->NewRangeTombstoneIterator( sv_->mem->NewRangeTombstoneIterator(
@ -769,8 +778,12 @@ void ForwardIterator::RenewIterators() {
} }
imm_iters_.clear(); imm_iters_.clear();
mutable_iter_ = svnew->mem->NewIterator(read_options_, &arena_); UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping =
svnew->imm->AddIterators(read_options_, &imm_iters_, &arena_); svnew->GetSeqnoToTimeMapping();
mutable_iter_ =
svnew->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_);
svnew->imm->AddIterators(read_options_, seqno_to_time_mapping, &imm_iters_,
&arena_);
ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(), ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
kMaxSequenceNumber /* upper_bound */); kMaxSequenceNumber /* upper_bound */);
if (!read_options_.ignore_range_deletions) { if (!read_options_.ignore_range_deletions) {

View file

@ -76,6 +76,7 @@ class ForwardIterator : public InternalIterator {
void Next() override; void Next() override;
Slice key() const override; Slice key() const override;
Slice value() const override; Slice value() const override;
uint64_t write_unix_time() const override;
Status status() const override; Status status() const override;
bool PrepareValue() override; bool PrepareValue() override;
Status GetProperty(std::string prop_name, std::string* prop) override; Status GetProperty(std::string prop_name, std::string* prop) override;

View file

@ -35,6 +35,12 @@ struct SuperVersionContext {
std::unique_ptr<SuperVersion> std::unique_ptr<SuperVersion>
new_superversion; // if nullptr no new superversion new_superversion; // if nullptr no new superversion
// If not nullptr, a new seqno to time mapping is available to be installed.
// Otherwise, make a shared copy of the one in the existing SuperVersion and
// carry it over to the new SuperVersion. This is moved to the SuperVersion
// during installation.
std::shared_ptr<const SeqnoToTimeMapping> new_seqno_to_time_mapping{nullptr};
explicit SuperVersionContext(bool create_superversion = false) explicit SuperVersionContext(bool create_superversion = false)
: new_superversion(create_superversion ? new SuperVersion() : nullptr) {} : new_superversion(create_superversion ? new SuperVersion() : nullptr) {}

View file

@ -364,11 +364,13 @@ const char* EncodeKey(std::string* scratch, const Slice& target) {
class MemTableIterator : public InternalIterator { class MemTableIterator : public InternalIterator {
public: public:
MemTableIterator(const MemTable& mem, const ReadOptions& read_options, MemTableIterator(const MemTable& mem, const ReadOptions& read_options,
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
Arena* arena, bool use_range_del_table = false) Arena* arena, bool use_range_del_table = false)
: bloom_(nullptr), : bloom_(nullptr),
prefix_extractor_(mem.prefix_extractor_), prefix_extractor_(mem.prefix_extractor_),
comparator_(mem.comparator_), comparator_(mem.comparator_),
valid_(false), valid_(false),
seqno_to_time_mapping_(seqno_to_time_mapping),
arena_mode_(arena != nullptr), arena_mode_(arena != nullptr),
value_pinned_( value_pinned_(
!mem.GetImmutableMemTableOptions()->inplace_update_support), !mem.GetImmutableMemTableOptions()->inplace_update_support),
@ -499,6 +501,18 @@ class MemTableIterator : public InternalIterator {
assert(Valid()); assert(Valid());
return GetLengthPrefixedSlice(iter_->key()); return GetLengthPrefixedSlice(iter_->key());
} }
uint64_t write_unix_time() const override {
assert(Valid());
// TODO(yuzhangyu): if value type is kTypeValuePreferredSeqno,
// parse its unix write time out of packed value.
if (!seqno_to_time_mapping_ || seqno_to_time_mapping_->Empty()) {
return std::numeric_limits<uint64_t>::max();
}
SequenceNumber seqno = ExtractSequenceNumber(key());
return seqno_to_time_mapping_->GetProximalTimeBeforeSeqno(seqno);
}
Slice value() const override { Slice value() const override {
assert(Valid()); assert(Valid());
Slice key_slice = GetLengthPrefixedSlice(iter_->key()); Slice key_slice = GetLengthPrefixedSlice(iter_->key());
@ -523,6 +537,8 @@ class MemTableIterator : public InternalIterator {
const MemTable::KeyComparator comparator_; const MemTable::KeyComparator comparator_;
MemTableRep::Iterator* iter_; MemTableRep::Iterator* iter_;
bool valid_; bool valid_;
// The seqno to time mapping is owned by the SuperVersion.
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping_;
bool arena_mode_; bool arena_mode_;
bool value_pinned_; bool value_pinned_;
uint32_t protection_bytes_per_key_; uint32_t protection_bytes_per_key_;
@ -541,11 +557,13 @@ class MemTableIterator : public InternalIterator {
} }
}; };
InternalIterator* MemTable::NewIterator(const ReadOptions& read_options, InternalIterator* MemTable::NewIterator(
Arena* arena) { const ReadOptions& read_options,
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena) {
assert(arena != nullptr); assert(arena != nullptr);
auto mem = arena->AllocateAligned(sizeof(MemTableIterator)); auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
return new (mem) MemTableIterator(*this, read_options, arena); return new (mem)
MemTableIterator(*this, read_options, seqno_to_time_mapping, arena);
} }
FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator( FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator(
@ -579,9 +597,9 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal(
if (!cache->initialized.load(std::memory_order_acquire)) { if (!cache->initialized.load(std::memory_order_acquire)) {
cache->reader_mutex.lock(); cache->reader_mutex.lock();
if (!cache->tombstones) { if (!cache->tombstones) {
auto* unfragmented_iter = auto* unfragmented_iter = new MemTableIterator(
new MemTableIterator(*this, read_options, nullptr /* arena */, *this, read_options, nullptr /* seqno_to_time_mapping= */,
true /* use_range_del_table */); nullptr /* arena */, true /* use_range_del_table */);
cache->tombstones.reset(new FragmentedRangeTombstoneList( cache->tombstones.reset(new FragmentedRangeTombstoneList(
std::unique_ptr<InternalIterator>(unfragmented_iter), std::unique_ptr<InternalIterator>(unfragmented_iter),
comparator_.comparator)); comparator_.comparator));
@ -600,9 +618,9 @@ void MemTable::ConstructFragmentedRangeTombstones() {
// There should be no concurrent Construction // There should be no concurrent Construction
if (!is_range_del_table_empty_.load(std::memory_order_relaxed)) { if (!is_range_del_table_empty_.load(std::memory_order_relaxed)) {
// TODO: plumb Env::IOActivity, Env::IOPriority // TODO: plumb Env::IOActivity, Env::IOPriority
auto* unfragmented_iter = auto* unfragmented_iter = new MemTableIterator(
new MemTableIterator(*this, ReadOptions(), nullptr /* arena */, *this, ReadOptions(), nullptr /*seqno_to_time_mapping=*/,
true /* use_range_del_table */); nullptr /* arena */, true /* use_range_del_table */);
fragmented_range_tombstone_list_ = fragmented_range_tombstone_list_ =
std::make_unique<FragmentedRangeTombstoneList>( std::make_unique<FragmentedRangeTombstoneList>(

View file

@ -20,6 +20,7 @@
#include "db/kv_checksum.h" #include "db/kv_checksum.h"
#include "db/range_tombstone_fragmenter.h" #include "db/range_tombstone_fragmenter.h"
#include "db/read_callback.h" #include "db/read_callback.h"
#include "db/seqno_to_time_mapping.h"
#include "db/version_edit.h" #include "db/version_edit.h"
#include "memory/allocator.h" #include "memory/allocator.h"
#include "memory/concurrent_arena.h" #include "memory/concurrent_arena.h"
@ -28,6 +29,7 @@
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "rocksdb/memtablerep.h" #include "rocksdb/memtablerep.h"
#include "table/multiget_context.h" #include "table/multiget_context.h"
#include "util/cast_util.h"
#include "util/dynamic_bloom.h" #include "util/dynamic_bloom.h"
#include "util/hash.h" #include "util/hash.h"
#include "util/hash_containers.h" #include "util/hash_containers.h"
@ -203,7 +205,11 @@ class MemTable {
// arena: If not null, the arena needs to be used to allocate the Iterator. // arena: If not null, the arena needs to be used to allocate the Iterator.
// Calling ~Iterator of the iterator will destroy all the states but // Calling ~Iterator of the iterator will destroy all the states but
// those allocated in arena. // those allocated in arena.
InternalIterator* NewIterator(const ReadOptions& read_options, Arena* arena); // seqno_to_time_mapping: it's used to support return write unix time for the
// data, currently only needed for iterators serving user reads.
InternalIterator* NewIterator(
const ReadOptions& read_options,
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena);
// Returns an iterator that yields the range tombstones of the memtable. // Returns an iterator that yields the range tombstones of the memtable.
// The caller must ensure that the underlying MemTable remains live // The caller must ensure that the underlying MemTable remains live

View file

@ -211,18 +211,22 @@ Status MemTableListVersion::AddRangeTombstoneIterators(
} }
void MemTableListVersion::AddIterators( void MemTableListVersion::AddIterators(
const ReadOptions& options, std::vector<InternalIterator*>* iterator_list, const ReadOptions& options,
Arena* arena) { UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
std::vector<InternalIterator*>* iterator_list, Arena* arena) {
for (auto& m : memlist_) { for (auto& m : memlist_) {
iterator_list->push_back(m->NewIterator(options, arena)); iterator_list->push_back(
m->NewIterator(options, seqno_to_time_mapping, arena));
} }
} }
void MemTableListVersion::AddIterators(const ReadOptions& options, void MemTableListVersion::AddIterators(
MergeIteratorBuilder* merge_iter_builder, const ReadOptions& options,
bool add_range_tombstone_iter) { UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
MergeIteratorBuilder* merge_iter_builder, bool add_range_tombstone_iter) {
for (auto& m : memlist_) { for (auto& m : memlist_) {
auto mem_iter = m->NewIterator(options, merge_iter_builder->GetArena()); auto mem_iter = m->NewIterator(options, seqno_to_time_mapping,
merge_iter_builder->GetArena());
if (!add_range_tombstone_iter || options.ignore_range_deletions) { if (!add_range_tombstone_iter || options.ignore_range_deletions) {
merge_iter_builder->AddIterator(mem_iter); merge_iter_builder->AddIterator(mem_iter);
} else { } else {

View file

@ -112,10 +112,12 @@ class MemTableListVersion {
RangeDelAggregator* range_del_agg); RangeDelAggregator* range_del_agg);
void AddIterators(const ReadOptions& options, void AddIterators(const ReadOptions& options,
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
std::vector<InternalIterator*>* iterator_list, std::vector<InternalIterator*>* iterator_list,
Arena* arena); Arena* arena);
void AddIterators(const ReadOptions& options, void AddIterators(const ReadOptions& options,
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping,
MergeIteratorBuilder* merge_iter_builder, MergeIteratorBuilder* merge_iter_builder,
bool add_range_tombstone_iter); bool add_range_tombstone_iter);

View file

@ -443,7 +443,8 @@ class Repairer {
ReadOptions ro; ReadOptions ro;
ro.total_order_seek = true; ro.total_order_seek = true;
Arena arena; Arena arena;
ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); ScopedArenaIterator iter(
mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
int64_t _current_time = 0; int64_t _current_time = 0;
immutable_db_options_.clock->GetCurrentTime(&_current_time) immutable_db_options_.clock->GetCurrentTime(&_current_time)
.PermitUncheckedError(); // ignore error .PermitUncheckedError(); // ignore error

View file

@ -59,7 +59,8 @@ static std::string PrintContents(WriteBatch* b,
std::unique_ptr<InternalIterator> iter_guard; std::unique_ptr<InternalIterator> iter_guard;
InternalIterator* iter; InternalIterator* iter;
if (i == 0) { if (i == 0) {
iter = mem->NewIterator(ReadOptions(), &arena); iter = mem->NewIterator(ReadOptions(), /*seqno_to_time_mapping=*/nullptr,
&arena);
arena_iter_guard.set(iter); arena_iter_guard.set(iter);
} else { } else {
iter = mem->NewRangeTombstoneIterator(ReadOptions(), iter = mem->NewRangeTombstoneIterator(ReadOptions(),

View file

@ -137,15 +137,18 @@ class Iterator : public Cleanable {
// Get the user-key portion of the internal key at which the iteration // Get the user-key portion of the internal key at which the iteration
// stopped. // stopped.
// Property "rocksdb.iterator.write-time": // Property "rocksdb.iterator.write-time":
// DO NOT USE, UNDER CONSTRUCTION
// Get the unix time of the best estimate of the write time of the entry. // Get the unix time of the best estimate of the write time of the entry.
// Returned as 64-bit raw value (8 bytes). It can be converted to uint64_t // Returned as 64-bit raw value (8 bytes). It can be converted to uint64_t
// with util method `DecodeU64Ts`. The accuracy of the write time depends on // with util method `DecodeU64Ts`. The accuracy of the write time depends on
// settings like preserve_internal_time_seconds. If this feature is // settings like preserve_internal_time_seconds. The actual write time of
// disabled, this property will always be empty. The actual write time of
// the entry should be the same or newer than the returned write time. So // the entry should be the same or newer than the returned write time. So
// this property can be interpreted as the possible oldest write time for // this property can be interpreted as the possible oldest write time for
// the entry. // the entry.
// If the seqno to time mapping recording is not enabled,
// std::numeric_limits<uint64_t>::max() will be returned to indicate the
// write time is unknown. For a data entry whose sequence number has
// been zeroed out (possible when it reaches the last level), 0 is returned
// regardless of whether the seqno to time recording feature is enabled.
virtual Status GetProperty(std::string prop_name, std::string* prop); virtual Status GetProperty(std::string prop_name, std::string* prop);
virtual Slice timestamp() const { virtual Slice timestamp() const {

View file

@ -60,7 +60,8 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(JNIEnv* env,
unsigned int count = 0; unsigned int count = 0;
ROCKSDB_NAMESPACE::Arena arena; ROCKSDB_NAMESPACE::Arena arena;
ROCKSDB_NAMESPACE::ScopedArenaIterator iter( ROCKSDB_NAMESPACE::ScopedArenaIterator iter(
mem->NewIterator(ROCKSDB_NAMESPACE::ReadOptions(), &arena)); mem->NewIterator(ROCKSDB_NAMESPACE::ReadOptions(),
/*seqno_to_time_mapping=*/nullptr, &arena));
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ROCKSDB_NAMESPACE::ParsedInternalKey ikey; ROCKSDB_NAMESPACE::ParsedInternalKey ikey;
ikey.clear(); ikey.clear();

View file

@ -9,6 +9,7 @@
#pragma once #pragma once
#include <deque> #include <deque>
#include "db/seqno_to_time_mapping.h"
#include "table/block_based/block_based_table_reader.h" #include "table/block_based/block_based_table_reader.h"
#include "table/block_based/block_based_table_reader_impl.h" #include "table/block_based/block_based_table_reader_impl.h"
#include "table/block_based/block_prefetcher.h" #include "table/block_based/block_prefetcher.h"
@ -92,6 +93,22 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
return const_cast<BlockBasedTableIterator*>(this) return const_cast<BlockBasedTableIterator*>(this)
->MaterializeCurrentBlock(); ->MaterializeCurrentBlock();
} }
uint64_t write_unix_time() const override {
assert(Valid());
// TODO(yuzhangyu): if value type is kTypeValuePreferredSeqno,
// parse its unix write time out of packed value.
const SeqnoToTimeMapping& seqno_to_time_mapping =
table_->GetSeqnoToTimeMapping();
SequenceNumber seqno = ExtractSequenceNumber(key());
if (kUnknownSeqnoBeforeAll == seqno) {
return kUnknownTimeBeforeAll;
} else if (seqno_to_time_mapping.Empty()) {
return std::numeric_limits<uint64_t>::max();
}
return seqno_to_time_mapping.GetProximalTimeBeforeSeqno(seqno);
}
Slice value() const override { Slice value() const override {
// PrepareValue() must have been called. // PrepareValue() must have been called.
assert(!is_at_first_key_from_index_); assert(!is_at_first_key_from_index_);

View file

@ -921,6 +921,17 @@ Status BlockBasedTable::ReadPropertiesBlock(
} else { } else {
assert(table_properties != nullptr); assert(table_properties != nullptr);
rep_->table_properties = std::move(table_properties); rep_->table_properties = std::move(table_properties);
if (s.ok()) {
s = rep_->seqno_to_time_mapping.DecodeFrom(
rep_->table_properties->seqno_to_time_mapping);
}
if (!s.ok()) {
ROCKS_LOG_WARN(
rep_->ioptions.logger,
"Problem reading or processing seqno-to-time mapping: %s",
s.ToString().c_str());
}
rep_->blocks_maybe_compressed = rep_->blocks_maybe_compressed =
rep_->table_properties->compression_name != rep_->table_properties->compression_name !=
CompressionTypeToString(kNoCompression); CompressionTypeToString(kNoCompression);
@ -1233,6 +1244,10 @@ std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
return rep_->table_properties; return rep_->table_properties;
} }
// Read-only access to the seqno to time mapping stored in this table's Rep.
// ReadPropertiesBlock() populates it by decoding the table properties'
// seqno_to_time_mapping field. The returned reference points into rep_.
const SeqnoToTimeMapping& BlockBasedTable::GetSeqnoToTimeMapping() const {
  const SeqnoToTimeMapping& mapping = rep_->seqno_to_time_mapping;
  return mapping;
}
size_t BlockBasedTable::ApproximateMemoryUsage() const { size_t BlockBasedTable::ApproximateMemoryUsage() const {
size_t usage = 0; size_t usage = 0;
if (rep_) { if (rep_) {

View file

@ -16,6 +16,7 @@
#include "cache/cache_key.h" #include "cache/cache_key.h"
#include "cache/cache_reservation_manager.h" #include "cache/cache_reservation_manager.h"
#include "db/range_tombstone_fragmenter.h" #include "db/range_tombstone_fragmenter.h"
#include "db/seqno_to_time_mapping.h"
#include "file/filename.h" #include "file/filename.h"
#include "rocksdb/slice_transform.h" #include "rocksdb/slice_transform.h"
#include "rocksdb/table_properties.h" #include "rocksdb/table_properties.h"
@ -197,6 +198,8 @@ class BlockBasedTable : public TableReader {
std::shared_ptr<const TableProperties> GetTableProperties() const override; std::shared_ptr<const TableProperties> GetTableProperties() const override;
const SeqnoToTimeMapping& GetSeqnoToTimeMapping() const;
size_t ApproximateMemoryUsage() const override; size_t ApproximateMemoryUsage() const override;
// convert SST file to a human readable form // convert SST file to a human readable form
@ -607,6 +610,7 @@ struct BlockBasedTable::Rep {
BlockHandle compression_dict_handle; BlockHandle compression_dict_handle;
std::shared_ptr<const TableProperties> table_properties; std::shared_ptr<const TableProperties> table_properties;
SeqnoToTimeMapping seqno_to_time_mapping;
BlockHandle index_handle; BlockHandle index_handle;
BlockBasedTableOptions::IndexType index_type; BlockBasedTableOptions::IndexType index_type;
bool whole_key_filtering; bool whole_key_filtering;

View file

@ -116,6 +116,14 @@ class InternalIteratorBase : public Cleanable {
// REQUIRES: Valid() // REQUIRES: Valid()
virtual Slice key() const = 0; virtual Slice key() const = 0;
// Approximate unix write time of the current entry. Implementations that
// have a sequence number to time mapping available deduce it from the
// entry's sequence number.
// This default implementation has no such mapping and always reports the
// maximum uint64_t value, which means "write time unknown".
virtual uint64_t write_unix_time() const {
  constexpr uint64_t kWriteTimeUnknown = std::numeric_limits<uint64_t>::max();
  return kWriteTimeUnknown;
}
// Return user key for the current entry. // Return user key for the current entry.
// REQUIRES: Valid() // REQUIRES: Valid()
virtual Slice user_key() const { return ExtractUserKey(key()); } virtual Slice user_key() const { return ExtractUserKey(key()); }

View file

@ -82,6 +82,12 @@ class IteratorWrapperBase {
assert(Valid()); assert(Valid());
return result_.key; return result_.key;
} }
// Approximate unix write time of the current entry. The value is fetched
// from the wrapped iterator (iter_) on every call.
// REQUIRES: Valid()
uint64_t write_unix_time() const {
  assert(Valid());
  const uint64_t wrapped_write_time = iter_->write_unix_time();
  return wrapped_write_time;
}
TValue value() const { TValue value() const {
assert(Valid()); assert(Valid());
return iter_->value(); return iter_->value();

View file

@ -430,6 +430,11 @@ class MergingIterator : public InternalIterator {
return current_->key(); return current_->key();
} }
// Approximate unix write time of the current entry, as reported by current_.
// REQUIRES: Valid()
uint64_t write_unix_time() const override {
  assert(Valid());
  const uint64_t current_write_time = current_->write_unix_time();
  return current_write_time;
}
Slice value() const override { Slice value() const override {
assert(Valid()); assert(Valid());
return current_->value(); return current_->value();

View file

@ -534,7 +534,9 @@ class MemTableConstructor : public Constructor {
InternalIterator* NewIterator( InternalIterator* NewIterator(
const SliceTransform* /*prefix_extractor*/) const override { const SliceTransform* /*prefix_extractor*/) const override {
return new KeyConvertingIterator( return new KeyConvertingIterator(
memtable_->NewIterator(ReadOptions(), &arena_), true); memtable_->NewIterator(ReadOptions(), /*seqno_to_time_mapping=*/nullptr,
&arena_),
true);
} }
bool AnywayDeleteIterator() const override { return true; } bool AnywayDeleteIterator() const override { return true; }
@ -4897,7 +4899,8 @@ TEST_F(MemTableTest, Simple) {
std::unique_ptr<InternalIterator> iter_guard; std::unique_ptr<InternalIterator> iter_guard;
InternalIterator* iter; InternalIterator* iter;
if (i == 0) { if (i == 0) {
iter = GetMemTable()->NewIterator(ReadOptions(), &arena); iter = GetMemTable()->NewIterator(
ReadOptions(), /*seqno_to_time_mapping=*/nullptr, &arena);
arena_iter_guard.set(iter); arena_iter_guard.set(iter);
} else { } else {
iter = GetMemTable()->NewRangeTombstoneIterator( iter = GetMemTable()->NewRangeTombstoneIterator(