rocksdb/db/seqno_time_test.cc

1380 lines
49 KiB
C++

// Copyright (c) Meta Platforms, Inc. and affiliates.
//
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "db/db_test_util.h"
#include "db/periodic_task_scheduler.h"
#include "db/seqno_to_time_mapping.h"
#include "port/stack_trace.h"
#include "rocksdb/iostats_context.h"
#include "rocksdb/utilities/debug.h"
#include "test_util/mock_time_env.h"
namespace ROCKSDB_NAMESPACE {
class SeqnoTimeTest : public DBTestBase {
public:
SeqnoTimeTest() : DBTestBase("seqno_time_test", /*env_do_fsync=*/false) {
mock_clock_ = std::make_shared<MockSystemClock>(env_->GetSystemClock());
mock_clock_->SetCurrentTime(kMockStartTime);
mock_env_ = std::make_unique<CompositeEnvWrapper>(env_, mock_clock_);
}
protected:
std::unique_ptr<Env> mock_env_;
std::shared_ptr<MockSystemClock> mock_clock_;
// Sufficient starting time that preserve time doesn't under-flow into
// pre-history
static constexpr uint32_t kMockStartTime = 10000000;
void SetUp() override {
mock_clock_->InstallTimedWaitFixCallback();
SyncPoint::GetInstance()->SetCallBack(
"DBImpl::StartPeriodicTaskScheduler:Init", [&](void* arg) {
auto periodic_task_scheduler_ptr =
reinterpret_cast<PeriodicTaskScheduler*>(arg);
periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock_.get());
});
mock_clock_->SetCurrentTime(kMockStartTime);
}
// make sure the file is not in cache, otherwise it won't have IO info
void AssertKeyTemperature(int key_id, Temperature expected_temperature) {
get_iostats_context()->Reset();
IOStatsContext* iostats = get_iostats_context();
std::string result = Get(Key(key_id));
ASSERT_FALSE(result.empty());
ASSERT_GT(iostats->bytes_read, 0);
switch (expected_temperature) {
case Temperature::kUnknown:
ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count,
0);
ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read,
0);
break;
case Temperature::kCold:
ASSERT_GT(iostats->file_io_stats_by_temperature.cold_file_read_count,
0);
ASSERT_GT(iostats->file_io_stats_by_temperature.cold_file_bytes_read,
0);
break;
default:
// the test only support kCold now for the bottommost temperature
FAIL();
}
}
};
TEST_F(SeqnoTimeTest, TemperatureBasicUniversal) {
const int kNumTrigger = 4;
const int kNumLevels = 7;
const int kNumKeys = 100;
const int kKeyPerSec = 10;
Options options = CurrentOptions();
options.compaction_style = kCompactionStyleUniversal;
options.preclude_last_level_data_seconds = 10000;
options.env = mock_env_.get();
options.bottommost_temperature = Temperature::kCold;
options.num_levels = kNumLevels;
DestroyAndReopen(options);
int sst_num = 0;
// Write files that are overlap and enough to trigger compaction
for (; sst_num < kNumTrigger; sst_num++) {
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun([&] {
mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
});
}
ASSERT_OK(Flush());
}
ASSERT_OK(dbfull()->TEST_WaitForCompact());
// All data is hot, only output to penultimate level
ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
// read a random key, which should be hot (kUnknown)
AssertKeyTemperature(20, Temperature::kUnknown);
// Write more data, but still all hot until the 10th SST, as:
// write a key every 10 seconds, 100 keys per SST, each SST takes 1000 seconds
// The preclude_last_level_data_seconds is 10k
for (; sst_num < kNumTrigger * 2; sst_num++) {
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun([&] {
mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
});
}
ASSERT_OK(Flush());
ASSERT_OK(dbfull()->TEST_WaitForCompact());
ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
}
// Now we have both hot data and cold data
for (; sst_num < kNumTrigger * 3; sst_num++) {
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun([&] {
mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
});
}
ASSERT_OK(Flush());
ASSERT_OK(dbfull()->TEST_WaitForCompact());
}
CompactRangeOptions cro;
cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
uint64_t hot_data_size = GetSstSizeHelper(Temperature::kUnknown);
uint64_t cold_data_size = GetSstSizeHelper(Temperature::kCold);
ASSERT_GT(hot_data_size, 0);
ASSERT_GT(cold_data_size, 0);
// the first a few key should be cold
AssertKeyTemperature(20, Temperature::kCold);
for (int i = 0; i < 30; i++) {
dbfull()->TEST_WaitForPeriodicTaskRun([&] {
mock_clock_->MockSleepForSeconds(static_cast<int>(20 * kKeyPerSec));
});
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
// the hot/cold data cut off range should be between i * 20 + 200 -> 250
AssertKeyTemperature(i * 20 + 250, Temperature::kUnknown);
AssertKeyTemperature(i * 20 + 200, Temperature::kCold);
}
ASSERT_LT(GetSstSizeHelper(Temperature::kUnknown), hot_data_size);
ASSERT_GT(GetSstSizeHelper(Temperature::kCold), cold_data_size);
// Wait again, the most of the data should be cold after that
// but it may not be all cold, because if there's no new data write to SST,
// the compaction will not get the new seqno->time sampling to decide the last
// a few data's time.
for (int i = 0; i < 5; i++) {
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(1000)); });
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
}
// any random data close to the end should be cold
AssertKeyTemperature(1000, Temperature::kCold);
// close explicitly, because the env is local variable which will be released
// first.
Close();
}
TEST_F(SeqnoTimeTest, TemperatureBasicLevel) {
const int kNumLevels = 7;
const int kNumKeys = 100;
Options options = CurrentOptions();
options.preclude_last_level_data_seconds = 10000;
options.env = mock_env_.get();
options.bottommost_temperature = Temperature::kCold;
options.num_levels = kNumLevels;
options.level_compaction_dynamic_level_bytes = true;
// TODO(zjay): for level compaction, auto-compaction may stuck in deadloop, if
// the penultimate level score > 1, but the hot is not cold enough to compact
// to last level, which will keep triggering compaction.
options.disable_auto_compactions = true;
DestroyAndReopen(options);
int sst_num = 0;
// Write files that are overlap
for (; sst_num < 4; sst_num++) {
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
}
ASSERT_OK(Flush());
}
CompactRangeOptions cro;
cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
// All data is hot, only output to penultimate level
ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
// read a random key, which should be hot (kUnknown)
AssertKeyTemperature(20, Temperature::kUnknown);
// Adding more data to have mixed hot and cold data
for (; sst_num < 14; sst_num++) {
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
}
ASSERT_OK(Flush());
}
// Second to last level
MoveFilesToLevel(5);
ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
// Compact the files to the last level which should split the hot/cold data
MoveFilesToLevel(6);
uint64_t hot_data_size = GetSstSizeHelper(Temperature::kUnknown);
uint64_t cold_data_size = GetSstSizeHelper(Temperature::kCold);
ASSERT_GT(hot_data_size, 0);
ASSERT_GT(cold_data_size, 0);
// the first a few key should be cold
AssertKeyTemperature(20, Temperature::kCold);
// Wait some time, with each wait, the cold data is increasing and hot data is
// decreasing
for (int i = 0; i < 30; i++) {
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(200)); });
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
uint64_t pre_hot = hot_data_size;
uint64_t pre_cold = cold_data_size;
hot_data_size = GetSstSizeHelper(Temperature::kUnknown);
cold_data_size = GetSstSizeHelper(Temperature::kCold);
ASSERT_LT(hot_data_size, pre_hot);
ASSERT_GT(cold_data_size, pre_cold);
// the hot/cold cut_off key should be around i * 20 + 400 -> 450
AssertKeyTemperature(i * 20 + 450, Temperature::kUnknown);
AssertKeyTemperature(i * 20 + 400, Temperature::kCold);
}
// Wait again, the most of the data should be cold after that
// hot data might not be empty, because if we don't write new data, there's
// no seqno->time sampling available to the compaction
for (int i = 0; i < 5; i++) {
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(1000)); });
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
}
// any random data close to the end should be cold
AssertKeyTemperature(1000, Temperature::kCold);
Close();
}
enum class SeqnoTimeTestType : char {
kTrackInternalTimeSeconds = 0,
kPrecludeLastLevel = 1,
kBothSetTrackSmaller = 2,
};
class SeqnoTimeTablePropTest
: public SeqnoTimeTest,
public ::testing::WithParamInterface<SeqnoTimeTestType> {
public:
SeqnoTimeTablePropTest() : SeqnoTimeTest() {}
void SetTrackTimeDurationOptions(uint64_t track_time_duration,
Options& options) const {
// either option set will enable the time tracking feature
switch (GetParam()) {
case SeqnoTimeTestType::kTrackInternalTimeSeconds:
options.preclude_last_level_data_seconds = 0;
options.preserve_internal_time_seconds = track_time_duration;
break;
case SeqnoTimeTestType::kPrecludeLastLevel:
options.preclude_last_level_data_seconds = track_time_duration;
options.preserve_internal_time_seconds = 0;
break;
case SeqnoTimeTestType::kBothSetTrackSmaller:
options.preclude_last_level_data_seconds = track_time_duration;
options.preserve_internal_time_seconds = track_time_duration / 10;
break;
}
}
};
INSTANTIATE_TEST_CASE_P(
SeqnoTimeTablePropTest, SeqnoTimeTablePropTest,
::testing::Values(SeqnoTimeTestType::kTrackInternalTimeSeconds,
SeqnoTimeTestType::kPrecludeLastLevel,
SeqnoTimeTestType::kBothSetTrackSmaller));
TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) {
Options options = CurrentOptions();
SetTrackTimeDurationOptions(10000, options);
options.env = mock_env_.get();
options.disable_auto_compactions = true;
DestroyAndReopen(options);
std::set<uint64_t> checked_file_nums;
SequenceNumber start_seq = dbfull()->GetLatestSequenceNumber() + 1;
uint64_t start_time = mock_clock_->NowSeconds();
// Write a key every 10 seconds
for (int i = 0; i < 200; i++) {
ASSERT_OK(Put(Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
}
ASSERT_OK(Flush());
TablePropertiesCollection tables_props;
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
ASSERT_EQ(tables_props.size(), 1);
auto it = tables_props.begin();
SeqnoToTimeMapping tp_mapping;
ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
ASSERT_OK(tp_mapping.Sort());
ASSERT_FALSE(tp_mapping.Empty());
auto seqs = tp_mapping.TEST_GetInternalMapping();
// about ~20 seqs->time entries, because the sample rate is 10000/100, and it
// passes 2k time. Add (roughly) one for starting entry.
ASSERT_GE(seqs.size(), 20);
ASSERT_LE(seqs.size(), 22);
SequenceNumber seq_end = dbfull()->GetLatestSequenceNumber() + 1;
for (auto i = start_seq; i < seq_end; i++) {
// The result is within the range
ASSERT_GE(tp_mapping.GetProximalTimeBeforeSeqno(i),
start_time + (i - start_seq) * 10 - 100);
ASSERT_LE(tp_mapping.GetProximalTimeBeforeSeqno(i),
start_time + (i - start_seq) * 10);
}
checked_file_nums.insert(it->second->orig_file_number);
start_seq = seq_end;
start_time = mock_clock_->NowSeconds();
// Write a key every 1 seconds
for (int i = 0; i < 200; i++) {
ASSERT_OK(Put(Key(i + 190), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(1)); });
}
seq_end = dbfull()->GetLatestSequenceNumber() + 1;
ASSERT_OK(Flush());
tables_props.clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
ASSERT_EQ(tables_props.size(), 2);
it = tables_props.begin();
while (it != tables_props.end()) {
if (!checked_file_nums.count(it->second->orig_file_number)) {
break;
}
it++;
}
ASSERT_TRUE(it != tables_props.end());
tp_mapping.Clear();
ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
ASSERT_OK(tp_mapping.Sort());
seqs = tp_mapping.TEST_GetInternalMapping();
// There only a few time sample
ASSERT_GE(seqs.size(), 1);
ASSERT_LE(seqs.size(), 3);
for (auto i = start_seq; i < seq_end; i++) {
ASSERT_GE(tp_mapping.GetProximalTimeBeforeSeqno(i),
start_time + (i - start_seq) - 100);
ASSERT_LE(tp_mapping.GetProximalTimeBeforeSeqno(i),
start_time + (i - start_seq));
}
checked_file_nums.insert(it->second->orig_file_number);
start_seq = seq_end;
start_time = mock_clock_->NowSeconds();
// Write a key every 200 seconds
for (int i = 0; i < 200; i++) {
ASSERT_OK(Put(Key(i + 380), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(200)); });
}
seq_end = dbfull()->GetLatestSequenceNumber() + 1;
ASSERT_OK(Flush());
tables_props.clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
ASSERT_EQ(tables_props.size(), 3);
it = tables_props.begin();
while (it != tables_props.end()) {
if (!checked_file_nums.count(it->second->orig_file_number)) {
break;
}
it++;
}
ASSERT_TRUE(it != tables_props.end());
tp_mapping.Clear();
ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
ASSERT_OK(tp_mapping.Sort());
seqs = tp_mapping.TEST_GetInternalMapping();
// The sequence number -> time entries should be maxed
ASSERT_GE(seqs.size(), 99);
ASSERT_LE(seqs.size(), 101);
for (auto i = start_seq; i < seq_end; i++) {
// aged out entries allowed to report time=0
if ((seq_end - i) * 200 <= 10000) {
ASSERT_GE(tp_mapping.GetProximalTimeBeforeSeqno(i),
start_time + (i - start_seq) * 200 - 100);
}
ASSERT_LE(tp_mapping.GetProximalTimeBeforeSeqno(i),
start_time + (i - start_seq) * 200);
}
checked_file_nums.insert(it->second->orig_file_number);
start_seq = seq_end;
start_time = mock_clock_->NowSeconds();
// Write a key every 100 seconds
for (int i = 0; i < 200; i++) {
ASSERT_OK(Put(Key(i + 570), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
}
seq_end = dbfull()->GetLatestSequenceNumber() + 1;
ASSERT_OK(Flush());
tables_props.clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
ASSERT_EQ(tables_props.size(), 4);
it = tables_props.begin();
while (it != tables_props.end()) {
if (!checked_file_nums.count(it->second->orig_file_number)) {
break;
}
it++;
}
ASSERT_TRUE(it != tables_props.end());
tp_mapping.Clear();
ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
ASSERT_OK(tp_mapping.Sort());
seqs = tp_mapping.TEST_GetInternalMapping();
ASSERT_GE(seqs.size(), 99);
ASSERT_LE(seqs.size(), 101);
checked_file_nums.insert(it->second->orig_file_number);
// re-enable compaction
ASSERT_OK(dbfull()->SetOptions({
{"disable_auto_compactions", "false"},
}));
ASSERT_OK(dbfull()->TEST_WaitForCompact());
tables_props.clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
ASSERT_GE(tables_props.size(), 1);
it = tables_props.begin();
while (it != tables_props.end()) {
if (!checked_file_nums.count(it->second->orig_file_number)) {
break;
}
it++;
}
ASSERT_TRUE(it != tables_props.end());
tp_mapping.Clear();
ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
ASSERT_OK(tp_mapping.Sort());
seqs = tp_mapping.TEST_GetInternalMapping();
ASSERT_GE(seqs.size(), 99);
ASSERT_LE(seqs.size(), 101);
for (auto i = start_seq; i < seq_end; i++) {
// aged out entries allowed to report time=0
// FIXME: should be <=
if ((seq_end - i) * 100 < 10000) {
ASSERT_GE(tp_mapping.GetProximalTimeBeforeSeqno(i),
start_time + (i - start_seq) * 100 - 100);
}
ASSERT_LE(tp_mapping.GetProximalTimeBeforeSeqno(i),
start_time + (i - start_seq) * 100);
}
ASSERT_OK(db_->Close());
}
TEST_P(SeqnoTimeTablePropTest, MultiCFs) {
Options options = CurrentOptions();
options.preclude_last_level_data_seconds = 0;
options.preserve_internal_time_seconds = 0;
options.env = mock_env_.get();
options.stats_dump_period_sec = 0;
options.stats_persist_period_sec = 0;
ReopenWithColumnFamilies({"default"}, options);
const PeriodicTaskScheduler& scheduler =
dbfull()->TEST_GetPeriodicTaskScheduler();
ASSERT_FALSE(scheduler.TEST_HasTask(PeriodicTaskType::kRecordSeqnoTime));
// Write some data and increase the current time
for (int i = 0; i < 200; i++) {
ASSERT_OK(Put(Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
}
ASSERT_OK(Flush());
TablePropertiesCollection tables_props;
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
ASSERT_EQ(tables_props.size(), 1);
auto it = tables_props.begin();
ASSERT_TRUE(it->second->seqno_to_time_mapping.empty());
ASSERT_TRUE(dbfull()->TEST_GetSeqnoToTimeMapping().Empty());
Options options_1 = options;
SetTrackTimeDurationOptions(10000, options_1);
CreateColumnFamilies({"one"}, options_1);
ASSERT_TRUE(scheduler.TEST_HasTask(PeriodicTaskType::kRecordSeqnoTime));
// Write some data to the default CF (without preclude_last_level feature)
for (int i = 0; i < 200; i++) {
ASSERT_OK(Put(Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
}
ASSERT_OK(Flush());
// Write some data to the CF one
for (int i = 0; i < 20; i++) {
ASSERT_OK(Put(1, Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
}
ASSERT_OK(Flush(1));
tables_props.clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[1], &tables_props));
ASSERT_EQ(tables_props.size(), 1);
it = tables_props.begin();
SeqnoToTimeMapping tp_mapping;
ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
ASSERT_OK(tp_mapping.Sort());
ASSERT_FALSE(tp_mapping.Empty());
auto seqs = tp_mapping.TEST_GetInternalMapping();
ASSERT_GE(seqs.size(), 1);
ASSERT_LE(seqs.size(), 4);
// Create one more CF with larger preclude_last_level time
Options options_2 = options;
SetTrackTimeDurationOptions(1000000, options_2); // 1m
CreateColumnFamilies({"two"}, options_2);
// Add more data to CF "two" to fill the in memory mapping
for (int i = 0; i < 2000; i++) {
ASSERT_OK(Put(2, Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
}
seqs = dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping();
ASSERT_GE(seqs.size(), 1000 - 1);
ASSERT_LE(seqs.size(), 1000 + 1);
ASSERT_OK(Flush(2));
tables_props.clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[2], &tables_props));
ASSERT_EQ(tables_props.size(), 1);
it = tables_props.begin();
tp_mapping.Clear();
ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
ASSERT_OK(tp_mapping.Sort());
seqs = tp_mapping.TEST_GetInternalMapping();
// the max encoded entries is 100
ASSERT_GE(seqs.size(), 100 - 1);
ASSERT_LE(seqs.size(), 100 + 1);
// Write some data to default CF, as all memtable with preclude_last_level
// enabled have flushed, the in-memory seqno->time mapping should be cleared
for (int i = 0; i < 10; i++) {
ASSERT_OK(Put(0, Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
}
seqs = dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping();
ASSERT_OK(Flush(0));
// trigger compaction for CF "two" and make sure the compaction output has
// seqno_to_time_mapping
for (int j = 0; j < 3; j++) {
for (int i = 0; i < 200; i++) {
ASSERT_OK(Put(2, Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
}
ASSERT_OK(Flush(2));
}
ASSERT_OK(dbfull()->TEST_WaitForCompact());
tables_props.clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[2], &tables_props));
ASSERT_EQ(tables_props.size(), 1);
it = tables_props.begin();
tp_mapping.Clear();
ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
ASSERT_OK(tp_mapping.Sort());
seqs = tp_mapping.TEST_GetInternalMapping();
ASSERT_GE(seqs.size(), 99);
ASSERT_LE(seqs.size(), 101);
for (int i = 0; i < 200; i++) {
ASSERT_OK(Put(0, Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
}
ASSERT_OK(Flush(0));
ASSERT_OK(dbfull()->TEST_WaitForCompact());
tables_props.clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[0], &tables_props));
ASSERT_EQ(tables_props.size(), 1);
it = tables_props.begin();
ASSERT_TRUE(it->second->seqno_to_time_mapping.empty());
// Write some data to CF "two", but don't flush to accumulate
for (int i = 0; i < 1000; i++) {
ASSERT_OK(Put(2, Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
}
ASSERT_GE(
dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping().size(),
500);
// After dropping CF "one", the in-memory mapping will be change to only
// follow CF "two" options.
ASSERT_OK(db_->DropColumnFamily(handles_[1]));
ASSERT_LE(
dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping().size(),
100 + 5);
// After dropping CF "two", the in-memory mapping is also clear.
ASSERT_OK(db_->DropColumnFamily(handles_[2]));
ASSERT_EQ(
dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping().size(),
0);
// And the timer worker is stopped
ASSERT_FALSE(scheduler.TEST_HasTask(PeriodicTaskType::kRecordSeqnoTime));
Close();
}
TEST_P(SeqnoTimeTablePropTest, MultiInstancesBasic) {
const int kInstanceNum = 2;
Options options = CurrentOptions();
SetTrackTimeDurationOptions(10000, options);
options.env = mock_env_.get();
options.stats_dump_period_sec = 0;
options.stats_persist_period_sec = 0;
auto dbs = std::vector<DB*>(kInstanceNum);
for (int i = 0; i < kInstanceNum; i++) {
ASSERT_OK(
DB::Open(options, test::PerThreadDBPath(std::to_string(i)), &(dbs[i])));
}
// Make sure the second instance has the worker enabled
auto dbi = static_cast_with_check<DBImpl>(dbs[1]);
WriteOptions wo;
for (int i = 0; i < 200; i++) {
ASSERT_OK(dbi->Put(wo, Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
}
SeqnoToTimeMapping seqno_to_time_mapping = dbi->TEST_GetSeqnoToTimeMapping();
ASSERT_GT(seqno_to_time_mapping.Size(), 10);
for (int i = 0; i < kInstanceNum; i++) {
ASSERT_OK(dbs[i]->Close());
delete dbs[i];
}
}
TEST_P(SeqnoTimeTablePropTest, SeqnoToTimeMappingUniversal) {
const int kNumTrigger = 4;
const int kNumLevels = 7;
const int kNumKeys = 100;
Options options = CurrentOptions();
SetTrackTimeDurationOptions(10000, options);
options.compaction_style = kCompactionStyleUniversal;
options.num_levels = kNumLevels;
options.env = mock_env_.get();
DestroyAndReopen(options);
std::atomic_uint64_t num_seqno_zeroing{0};
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
SyncPoint::GetInstance()->SetCallBack(
"CompactionIterator::PrepareOutput:ZeroingSeq",
[&](void* /*arg*/) { num_seqno_zeroing++; });
SyncPoint::GetInstance()->EnableProcessing();
int sst_num = 0;
for (; sst_num < kNumTrigger - 1; sst_num++) {
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
}
ASSERT_OK(Flush());
}
TablePropertiesCollection tables_props;
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
ASSERT_EQ(tables_props.size(), 3);
for (const auto& props : tables_props) {
ASSERT_FALSE(props.second->seqno_to_time_mapping.empty());
SeqnoToTimeMapping tp_mapping;
ASSERT_OK(tp_mapping.Add(props.second->seqno_to_time_mapping));
ASSERT_OK(tp_mapping.Sort());
ASSERT_FALSE(tp_mapping.Empty());
auto seqs = tp_mapping.TEST_GetInternalMapping();
// Add (roughly) one for starting entry.
ASSERT_GE(seqs.size(), 10);
ASSERT_LE(seqs.size(), 10 + 2);
}
// Trigger a compaction
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
}
sst_num++;
ASSERT_OK(Flush());
ASSERT_OK(dbfull()->TEST_WaitForCompact());
tables_props.clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
ASSERT_EQ(tables_props.size(), 1);
auto it = tables_props.begin();
SeqnoToTimeMapping tp_mapping;
ASSERT_FALSE(it->second->seqno_to_time_mapping.empty());
ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
// compact to the last level
CompactRangeOptions cro;
cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
// make sure the data is all compacted to penultimate level if the feature is
// on, otherwise, compacted to the last level.
if (options.preclude_last_level_data_seconds > 0) {
ASSERT_GT(NumTableFilesAtLevel(5), 0);
ASSERT_EQ(NumTableFilesAtLevel(6), 0);
} else {
ASSERT_EQ(NumTableFilesAtLevel(5), 0);
ASSERT_GT(NumTableFilesAtLevel(6), 0);
}
// regardless the file is on the last level or not, it should keep the time
// information and sequence number are not set
tables_props.clear();
tp_mapping.Clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
ASSERT_EQ(tables_props.size(), 1);
ASSERT_EQ(num_seqno_zeroing, 0);
it = tables_props.begin();
ASSERT_FALSE(it->second->seqno_to_time_mapping.empty());
ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
// make half of the data expired
mock_clock_->MockSleepForSeconds(static_cast<int>(8000));
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
tables_props.clear();
tp_mapping.Clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
if (options.preclude_last_level_data_seconds > 0) {
ASSERT_EQ(tables_props.size(), 2);
} else {
ASSERT_EQ(tables_props.size(), 1);
}
ASSERT_GT(num_seqno_zeroing, 0);
std::vector<KeyVersion> key_versions;
ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
std::numeric_limits<size_t>::max(),
&key_versions));
// make sure there're more than 300 keys and first 100 keys are having seqno
// zeroed out, the last 100 key seqno not zeroed out
ASSERT_GT(key_versions.size(), 300);
for (int i = 0; i < 100; i++) {
ASSERT_EQ(key_versions[i].sequence, 0);
}
auto rit = key_versions.rbegin();
for (int i = 0; i < 100; i++) {
ASSERT_GT(rit->sequence, 0);
rit++;
}
// make all data expired and compact again to push it to the last level
// regardless if the tiering feature is enabled or not
mock_clock_->MockSleepForSeconds(static_cast<int>(20000));
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
ASSERT_GT(num_seqno_zeroing, 0);
ASSERT_GT(NumTableFilesAtLevel(6), 0);
Close();
}
TEST_P(SeqnoTimeTablePropTest, PrePopulateInDB) {
Options base_options = CurrentOptions();
base_options.env = mock_env_.get();
base_options.disable_auto_compactions = true;
base_options.create_missing_column_families = true;
Options track_options = base_options;
constexpr uint32_t kPreserveSecs = 1234567;
SetTrackTimeDurationOptions(kPreserveSecs, track_options);
SeqnoToTimeMapping sttm;
SequenceNumber latest_seqno;
uint64_t start_time, end_time;
// #### DB#1, #2: No pre-population without preserve/preclude ####
// #### But a single entry is added when preserve/preclude enabled ####
for (bool with_write : {false, true}) {
SCOPED_TRACE("with_write=" + std::to_string(with_write));
DestroyAndReopen(base_options);
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
ASSERT_TRUE(sttm.Empty());
ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
if (with_write) {
// Ensure that writes before new CF with preserve/preclude option don't
// interfere with the seqno-to-time mapping getting a starting entry.
ASSERT_OK(Put("foo", "bar"));
ASSERT_OK(Flush());
} else {
// FIXME: currently, starting entry after CreateColumnFamily requires
// non-zero seqno
ASSERT_OK(Delete("blah"));
}
// Unfortunately, if we add a CF with preserve/preclude option after
// open, that does not reserve seqnos with pre-populated time mappings.
CreateColumnFamilies({"one"}, track_options);
// No pre-population (unfortunately), just a single starting entry
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
latest_seqno = db_->GetLatestSequenceNumber();
start_time = mock_clock_->NowSeconds();
ASSERT_EQ(sttm.Size(), 1);
ASSERT_EQ(latest_seqno, 1U);
// Current time maps to starting entry / seqno
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time), 1U);
// Any older times are unknown.
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time - 1),
kUnknownSeqnoBeforeAll);
// Now check that writes can proceed normally (passing about 20% of preserve
// time)
for (int i = 0; i < 20; i++) {
ASSERT_OK(Put(Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun([&] {
mock_clock_->MockSleepForSeconds(static_cast<int>(kPreserveSecs / 99));
});
}
ASSERT_OK(Flush());
// Check that mappings are getting populated
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
latest_seqno = db_->GetLatestSequenceNumber();
end_time = mock_clock_->NowSeconds();
ASSERT_EQ(sttm.Size(), 21);
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(end_time), latest_seqno);
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time), 1U);
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time - 1),
kUnknownSeqnoBeforeAll);
}
// ### DB#3, #4: Read-only DB with preserve/preclude after not ####
// Make sure we don't hit issues with read-only DBs, which don't need
// the mapping in the DB state (though it wouldn't hurt anything)
for (bool with_write : {false, true}) {
SCOPED_TRACE("with_write=" + std::to_string(with_write));
DestroyAndReopen(base_options);
if (with_write) {
ASSERT_OK(Put("foo", "bar"));
ASSERT_OK(Flush());
}
ASSERT_OK(ReadOnlyReopen(base_options));
if (with_write) {
ASSERT_EQ(Get("foo"), "bar");
}
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
ASSERT_EQ(sttm.Size(), 0);
if (!with_write) {
ASSERT_EQ(db_->GetLatestSequenceNumber(), 0);
}
ASSERT_OK(ReadOnlyReopen(track_options));
if (with_write) {
ASSERT_EQ(Get("foo"), "bar");
}
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
ASSERT_EQ(sttm.Size(), 0);
if (!with_write) {
ASSERT_EQ(db_->GetLatestSequenceNumber(), 0);
// And even if we re-open read-write, we do not get pre-population,
// because that's only for new DBs.
Reopen(track_options);
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
ASSERT_EQ(sttm.Size(), 0);
ASSERT_EQ(db_->GetLatestSequenceNumber(), 0);
}
}
// #### DB#5: Destroy and open with preserve/preclude option ####
DestroyAndReopen(track_options);
// Ensure pre-population
constexpr auto kPrePopPairs = SeqnoToTimeMapping::kMaxSeqnoTimePairsPerSST;
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
latest_seqno = db_->GetLatestSequenceNumber();
start_time = mock_clock_->NowSeconds();
ASSERT_EQ(sttm.Size(), kPrePopPairs);
// One nono-zero sequence number per pre-populated pair (this could be
// revised if we want to use interpolation for better approximate time
// mappings with no guarantee of erring in just one direction).
ASSERT_EQ(latest_seqno, kPrePopPairs);
// Current time maps to last pre-allocated seqno
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time), latest_seqno);
// Oldest tracking time maps to first pre-allocated seqno
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time - kPreserveSecs), 1);
// In more detail, check that estimated seqnos (pre-allocated) are uniformly
// spread over the tracked time.
for (auto ratio : {0.0, 0.433, 0.678, 0.987, 1.0}) {
// Round up query time
uint64_t t = start_time - kPreserveSecs +
static_cast<uint64_t>(ratio * kPreserveSecs + 0.9999999);
// Round down estimated seqno
SequenceNumber s =
static_cast<SequenceNumber>(ratio * (latest_seqno - 1)) + 1;
// Match
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(t), s);
}
// Now check that writes can proceed normally (passing about 20% of preserve
// time)
for (int i = 0; i < 20; i++) {
ASSERT_OK(Put(Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun([&] {
mock_clock_->MockSleepForSeconds(static_cast<int>(kPreserveSecs / 99));
});
}
ASSERT_OK(Flush());
// Can still see some pre-populated mappings, though some displaced
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
latest_seqno = db_->GetLatestSequenceNumber();
end_time = mock_clock_->NowSeconds();
ASSERT_EQ(sttm.Size(), kPrePopPairs);
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(end_time), latest_seqno);
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time - kPreserveSecs / 2),
kPrePopPairs / 2);
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time - kPreserveSecs),
kUnknownSeqnoBeforeAll);
// Make sure we don't hit issues with read-only DBs, which don't need
// the mapping in the DB state (though it wouldn't hurt anything)
ASSERT_OK(ReadOnlyReopen(track_options));
ASSERT_EQ(Get(Key(0)), "value");
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
ASSERT_EQ(sttm.Size(), 0);
// #### DB#6: Destroy and open+create an extra CF with preserve/preclude ####
// (default CF does not have the option)
Destroy(track_options);
ReopenWithColumnFamilies({"default", "one"},
List({base_options, track_options}));
// Ensure pre-population (not as exhaustive checking here)
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
latest_seqno = db_->GetLatestSequenceNumber();
start_time = mock_clock_->NowSeconds();
ASSERT_EQ(sttm.Size(), kPrePopPairs);
// One nono-zero sequence number per pre-populated pair (this could be
// revised if we want to use interpolation for better approximate time
// mappings with no guarantee of erring in just one direction).
ASSERT_EQ(latest_seqno, kPrePopPairs);
// Current time maps to last pre-allocated seqno
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time), latest_seqno);
// Oldest tracking time maps to first pre-allocated seqno
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time - kPreserveSecs), 1);
// Even after no writes and DB re-open without tracking options, sequence
// numbers should not go backward into those that were pre-allocated.
// (Future work: persist the mapping)
ReopenWithColumnFamilies({"default", "one"},
List({base_options, base_options}));
ASSERT_EQ(latest_seqno, db_->GetLatestSequenceNumber());
Close();
}
TEST_F(SeqnoTimeTest, MappingAppend) {
SeqnoToTimeMapping test(/*max_time_duration=*/100, /*max_capacity=*/10);
// ignore seqno == 0, as it may mean the seqno is zeroed out
ASSERT_FALSE(test.Append(0, 9));
ASSERT_TRUE(test.Append(3, 10));
auto size = test.Size();
// normal add
ASSERT_TRUE(test.Append(10, 11));
size++;
ASSERT_EQ(size, test.Size());
// Append unsorted
ASSERT_FALSE(test.Append(8, 12));
ASSERT_EQ(size, test.Size());
// Append with the same seqno, newer time is rejected because that makes
// GetProximalSeqnoBeforeTime queries worse (see later test)
ASSERT_FALSE(test.Append(10, 12));
ASSERT_EQ(size, test.Size());
// older time will be ignored
ASSERT_FALSE(test.Append(10, 9));
ASSERT_EQ(size, test.Size());
// new seqno with old time will be ignored
ASSERT_FALSE(test.Append(12, 8));
ASSERT_EQ(size, test.Size());
// new seqno with same time is accepted by replacing last entry
// (improves GetProximalSeqnoBeforeTime queries without blowing up size)
ASSERT_TRUE(test.Append(12, 11));
ASSERT_EQ(size, test.Size());
}
TEST_F(SeqnoTimeTest, ProximalFunctions) {
SeqnoToTimeMapping test(/*max_time_duration=*/100, /*max_capacity=*/10);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(1), kUnknownTimeBeforeAll);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(1000000000000U),
kUnknownTimeBeforeAll);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(1), kUnknownSeqnoBeforeAll);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(1000000000000U),
kUnknownSeqnoBeforeAll);
// (Taken from example in SeqnoToTimeMapping class comment)
// Time 500 is after seqno 10 and before seqno 11
EXPECT_TRUE(test.Append(10, 500));
// Seqno too early
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(9), kUnknownTimeBeforeAll);
// We only know that 500 is after 10
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(10), kUnknownTimeBeforeAll);
// Found
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(11), 500U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(1000000000000U), 500U);
// Time too early
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(499), kUnknownSeqnoBeforeAll);
// Found
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(500), 10U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(501), 10U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(1000000000000U), 10U);
// More samples
EXPECT_TRUE(test.Append(20, 600));
EXPECT_TRUE(test.Append(30, 700));
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(10), kUnknownTimeBeforeAll);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(11), 500U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(20), 500U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(21), 600U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(30), 600U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(31), 700U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(1000000000000U), 700U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(499), kUnknownSeqnoBeforeAll);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(500), 10U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(501), 10U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(599), 10U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(600), 20U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(601), 20U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(699), 20U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(700), 30U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(701), 30U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(1000000000000U), 30U);
// Redundant sample ignored
EXPECT_EQ(test.Size(), 3U);
EXPECT_FALSE(test.Append(30, 700));
EXPECT_EQ(test.Size(), 3U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(30), 600U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(31), 700U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(699), 20U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(700), 30U);
// Later sample with same seqno is ignored, to provide best results
// for GetProximalSeqnoBeforeTime function while saving entries
// in SeqnoToTimeMapping.
EXPECT_FALSE(test.Append(30, 800));
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(30), 600U);
// Could return 800, but saving space in SeqnoToTimeMapping instead.
// Can reconsider if/when GetProximalTimeBeforeSeqno is used in
// production.
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(31), 700U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(699), 20U);
// If the existing {30, 700} entry were replaced with {30, 800}, this
// would return seqno 20 instead of 30, which would preclude more than
// necessary for "preclude_last_level_data_seconds" feature.
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(700), 30U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(800), 30U);
// Still OK
EXPECT_TRUE(test.Append(40, 900));
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(30), 600U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(41), 900U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(899), 30U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(900), 40U);
// Burst of writes during a short time creates an opportunity
// for better results from GetProximalSeqnoBeforeTime(), at the
// expense of GetProximalTimeBeforeSeqno().
EXPECT_TRUE(test.Append(50, 900));
// These are subject to later revision depending on priorities
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(49), 700U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(51), 900U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(899), 30U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(900), 50U);
}
TEST_F(SeqnoTimeTest, PrePopulate) {
SeqnoToTimeMapping test(/*max_time_duration=*/100, /*max_capacity=*/10);
EXPECT_EQ(test.Size(), 0U);
// Smallest case is like two Appends
test.PrePopulate(10, 11, 500, 600);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(10), kUnknownTimeBeforeAll);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(11), 500U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(12), 600U);
test.Clear();
// Populate a small range
uint64_t kTimeIncrement = 1234567;
test.PrePopulate(1, 12, kTimeIncrement, kTimeIncrement * 2);
for (uint64_t i = 0; i <= 12; ++i) {
// NOTE: with 1 and 12 as the pre-populated end points, the duration is
// broken into 11 equal(-ish) spans
uint64_t t = kTimeIncrement + (i * kTimeIncrement) / 11 - 1;
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(t), i);
}
test.Clear();
// Populate an excessively large range (in the future we might want to
// interpolate estimated times for seqnos between entries)
test.PrePopulate(1, 34567, kTimeIncrement, kTimeIncrement * 2);
for (auto ratio : {0.0, 0.433, 0.678, 0.987, 1.0}) {
// Round up query time
uint64_t t = kTimeIncrement +
static_cast<uint64_t>(ratio * kTimeIncrement + 0.9999999);
// Round down estimated seqno
SequenceNumber s = static_cast<SequenceNumber>(ratio * (34567 - 1)) + 1;
// Match
// TODO: for now this is exact, but in the future might need approximation
// bounds to account for limited samples.
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(t), s);
}
}
TEST_F(SeqnoTimeTest, TruncateOldEntries) {
constexpr uint64_t kMaxTimeDuration = 42;
SeqnoToTimeMapping test(kMaxTimeDuration, /*max_capacity=*/10);
EXPECT_EQ(test.Size(), 0U);
// Safe on empty mapping
test.TruncateOldEntries(500);
EXPECT_EQ(test.Size(), 0U);
// (Taken from example in SeqnoToTimeMapping class comment)
// Time 500 is after seqno 10 and before seqno 11
EXPECT_TRUE(test.Append(10, 500));
EXPECT_TRUE(test.Append(20, 600));
EXPECT_TRUE(test.Append(30, 700));
EXPECT_TRUE(test.Append(40, 800));
EXPECT_TRUE(test.Append(50, 900));
EXPECT_EQ(test.Size(), 5U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(500), 10U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(599), 10U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(600), 20U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(699), 20U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(700), 30U);
// etc.
// Must keep first entry
test.TruncateOldEntries(500 + kMaxTimeDuration);
EXPECT_EQ(test.Size(), 5U);
test.TruncateOldEntries(599 + kMaxTimeDuration);
EXPECT_EQ(test.Size(), 5U);
// Purges first entry
test.TruncateOldEntries(600 + kMaxTimeDuration);
EXPECT_EQ(test.Size(), 4U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(500), kUnknownSeqnoBeforeAll);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(599), kUnknownSeqnoBeforeAll);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(600), 20U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(699), 20U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(700), 30U);
// No effect
test.TruncateOldEntries(600 + kMaxTimeDuration);
EXPECT_EQ(test.Size(), 4U);
test.TruncateOldEntries(699 + kMaxTimeDuration);
EXPECT_EQ(test.Size(), 4U);
// Purges next two
test.TruncateOldEntries(899 + kMaxTimeDuration);
EXPECT_EQ(test.Size(), 2U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(799), kUnknownSeqnoBeforeAll);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(899), 40U);
// Always keep last entry, to have a non-trivial seqno bound
test.TruncateOldEntries(10000000);
EXPECT_EQ(test.Size(), 1U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(10000000), 50U);
}
TEST_F(SeqnoTimeTest, Sort) {
SeqnoToTimeMapping test;
// single entry
test.Add(10, 11);
ASSERT_OK(test.Sort());
ASSERT_EQ(test.Size(), 1);
// duplicate, should be removed by sort
test.Add(10, 11);
// same seqno, but older time, should be removed
test.Add(10, 9);
// unuseful ones, should be removed by sort
test.Add(11, 9);
test.Add(9, 8);
// Good ones
test.Add(1, 10);
test.Add(100, 100);
ASSERT_OK(test.Sort());
auto seqs = test.TEST_GetInternalMapping();
std::deque<SeqnoToTimeMapping::SeqnoTimePair> expected;
expected.emplace_back(1, 10);
expected.emplace_back(10, 11);
expected.emplace_back(100, 100);
ASSERT_EQ(expected, seqs);
}
TEST_F(SeqnoTimeTest, EncodeDecodeBasic) {
SeqnoToTimeMapping test(0, 1000);
std::string output;
test.Encode(output, 0, 1000, 100);
ASSERT_TRUE(output.empty());
for (int i = 1; i <= 1000; i++) {
ASSERT_TRUE(test.Append(i, i * 10));
}
test.Encode(output, 0, 1000, 100);
ASSERT_FALSE(output.empty());
SeqnoToTimeMapping decoded;
ASSERT_OK(decoded.Add(output));
ASSERT_OK(decoded.Sort());
ASSERT_EQ(decoded.Size(), SeqnoToTimeMapping::kMaxSeqnoTimePairsPerSST);
ASSERT_EQ(test.Size(), 1000);
for (SequenceNumber seq = 0; seq <= 1000; seq++) {
// test has the more accurate time mapping, encode only pick
// kMaxSeqnoTimePairsPerSST number of entries, which is less accurate
uint64_t target_time = test.GetProximalTimeBeforeSeqno(seq);
ASSERT_GE(decoded.GetProximalTimeBeforeSeqno(seq),
target_time < 200 ? 0 : target_time - 200);
ASSERT_LE(decoded.GetProximalTimeBeforeSeqno(seq), target_time);
}
}
TEST_F(SeqnoTimeTest, EncodeDecodePerferNewTime) {
SeqnoToTimeMapping test(0, 10);
test.Append(1, 10);
test.Append(5, 17);
test.Append(6, 25);
test.Append(8, 30);
std::string output;
test.Encode(output, 1, 10, 0, 3);
SeqnoToTimeMapping decoded;
ASSERT_OK(decoded.Add(output));
ASSERT_OK(decoded.Sort());
ASSERT_EQ(decoded.Size(), 3);
auto seqs = decoded.TEST_GetInternalMapping();
std::deque<SeqnoToTimeMapping::SeqnoTimePair> expected;
expected.emplace_back(1, 10);
expected.emplace_back(6, 25);
expected.emplace_back(8, 30);
ASSERT_EQ(expected, seqs);
// Add a few large time number
test.Append(10, 100);
test.Append(13, 200);
test.Append(16, 300);
output.clear();
test.Encode(output, 1, 20, 0, 4);
decoded.Clear();
ASSERT_OK(decoded.Add(output));
ASSERT_OK(decoded.Sort());
ASSERT_EQ(decoded.Size(), 4);
expected.clear();
expected.emplace_back(1, 10);
// entry #6, #8 are skipped as they are too close to #1.
// entry #100 is also within skip range, but if it's skipped, there not enough
// number to fill 4 entries, so select it.
expected.emplace_back(10, 100);
expected.emplace_back(13, 200);
expected.emplace_back(16, 300);
seqs = decoded.TEST_GetInternalMapping();
ASSERT_EQ(expected, seqs);
}
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}