rocksdb/db/seqno_time_test.cc
Peter Dillinger d895eb08b3 Fix UB/crash in new SeqnoToTimeMapping::CopyFromSeqnoRange (#12293)
Summary:
After https://github.com/facebook/rocksdb/issues/12253 this function has crashed in the crash test, in its call to `std::copy`. I haven't reproduced the crash directly, but `std::copy` probably has undefined behavior if the starting iterator is after the ending iterator, which was possible. I've fixed the logic to deal with that case and to add an assertion to check that precondition of `std::copy` (which appears can be unchecked by `std::copy` itself even with UBSAN+ASAN).

Also added some unit tests etc. that were unfinished for https://github.com/facebook/rocksdb/issues/12253, and slightly tweak SeqnoToTimeMapping::EnforceMaxTimeSpan handling of zero time span case.

This is intended for patching 8.11.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12293

Test Plan: tests added. Will trigger ~20 runs of the crash test job that saw the crash. https://fburl.com/ci/5iiizvfa

Reviewed By: jowlyzhang

Differential Revision: D53090422

Pulled By: pdillinger

fbshipit-source-id: 69d60b1847d9c7e4ae62b153011c2040405db461
2024-01-25 11:27:15 -08:00

1631 lines
56 KiB
C++

// Copyright (c) Meta Platforms, Inc. and affiliates.
//
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "db/db_test_util.h"
#include "db/periodic_task_scheduler.h"
#include "db/seqno_to_time_mapping.h"
#include "port/stack_trace.h"
#include "rocksdb/iostats_context.h"
#include "rocksdb/utilities/debug.h"
#include "test_util/mock_time_env.h"
namespace ROCKSDB_NAMESPACE {
class SeqnoTimeTest : public DBTestBase {
public:
SeqnoTimeTest() : DBTestBase("seqno_time_test", /*env_do_fsync=*/false) {
mock_clock_ = std::make_shared<MockSystemClock>(env_->GetSystemClock());
mock_clock_->SetCurrentTime(kMockStartTime);
mock_env_ = std::make_unique<CompositeEnvWrapper>(env_, mock_clock_);
}
protected:
std::unique_ptr<Env> mock_env_;
std::shared_ptr<MockSystemClock> mock_clock_;
// Sufficient starting time that preserve time doesn't under-flow into
// pre-history
static constexpr uint32_t kMockStartTime = 10000000;
void SetUp() override {
mock_clock_->InstallTimedWaitFixCallback();
SyncPoint::GetInstance()->SetCallBack(
"DBImpl::StartPeriodicTaskScheduler:Init",
[mock_clock = mock_clock_](void* arg) {
auto periodic_task_scheduler_ptr =
reinterpret_cast<PeriodicTaskScheduler*>(arg);
periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock.get());
});
mock_clock_->SetCurrentTime(kMockStartTime);
}
// make sure the file is not in cache, otherwise it won't have IO info
void AssertKeyTemperature(int key_id, Temperature expected_temperature) {
get_iostats_context()->Reset();
IOStatsContext* iostats = get_iostats_context();
std::string result = Get(Key(key_id));
ASSERT_FALSE(result.empty());
ASSERT_GT(iostats->bytes_read, 0);
switch (expected_temperature) {
case Temperature::kUnknown:
ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count,
0);
ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read,
0);
break;
case Temperature::kCold:
ASSERT_GT(iostats->file_io_stats_by_temperature.cold_file_read_count,
0);
ASSERT_GT(iostats->file_io_stats_by_temperature.cold_file_bytes_read,
0);
break;
default:
// the test only support kCold now for the bottommost temperature
FAIL();
}
}
};
TEST_F(SeqnoTimeTest, TemperatureBasicUniversal) {
const int kNumTrigger = 4;
const int kNumLevels = 7;
const int kNumKeys = 100;
const int kKeyPerSec = 10;
Options options = CurrentOptions();
options.compaction_style = kCompactionStyleUniversal;
options.preclude_last_level_data_seconds = 10000;
options.env = mock_env_.get();
options.bottommost_temperature = Temperature::kCold;
options.num_levels = kNumLevels;
DestroyAndReopen(options);
int sst_num = 0;
// Write files that are overlap and enough to trigger compaction
for (; sst_num < kNumTrigger; sst_num++) {
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun([&] {
mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
});
}
ASSERT_OK(Flush());
}
ASSERT_OK(dbfull()->TEST_WaitForCompact());
// All data is hot, only output to penultimate level
ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
// read a random key, which should be hot (kUnknown)
AssertKeyTemperature(20, Temperature::kUnknown);
// Write more data, but still all hot until the 10th SST, as:
// write a key every 10 seconds, 100 keys per SST, each SST takes 1000 seconds
// The preclude_last_level_data_seconds is 10k
for (; sst_num < kNumTrigger * 2; sst_num++) {
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun([&] {
mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
});
}
ASSERT_OK(Flush());
ASSERT_OK(dbfull()->TEST_WaitForCompact());
ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
}
// Now we have both hot data and cold data
for (; sst_num < kNumTrigger * 3; sst_num++) {
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun([&] {
mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
});
}
ASSERT_OK(Flush());
ASSERT_OK(dbfull()->TEST_WaitForCompact());
}
CompactRangeOptions cro;
cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
uint64_t hot_data_size = GetSstSizeHelper(Temperature::kUnknown);
uint64_t cold_data_size = GetSstSizeHelper(Temperature::kCold);
ASSERT_GT(hot_data_size, 0);
ASSERT_GT(cold_data_size, 0);
// the first a few key should be cold
AssertKeyTemperature(20, Temperature::kCold);
for (int i = 0; i < 30; i++) {
dbfull()->TEST_WaitForPeriodicTaskRun([&] {
mock_clock_->MockSleepForSeconds(static_cast<int>(20 * kKeyPerSec));
});
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
// the hot/cold data cut off range should be between i * 20 + 200 -> 250
AssertKeyTemperature(i * 20 + 250, Temperature::kUnknown);
AssertKeyTemperature(i * 20 + 200, Temperature::kCold);
}
ASSERT_LT(GetSstSizeHelper(Temperature::kUnknown), hot_data_size);
ASSERT_GT(GetSstSizeHelper(Temperature::kCold), cold_data_size);
// Wait again, the most of the data should be cold after that
// but it may not be all cold, because if there's no new data write to SST,
// the compaction will not get the new seqno->time sampling to decide the last
// a few data's time.
for (int i = 0; i < 5; i++) {
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(1000)); });
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
}
// any random data close to the end should be cold
AssertKeyTemperature(1000, Temperature::kCold);
// close explicitly, because the env is local variable which will be released
// first.
Close();
}
TEST_F(SeqnoTimeTest, TemperatureBasicLevel) {
const int kNumLevels = 7;
const int kNumKeys = 100;
Options options = CurrentOptions();
options.preclude_last_level_data_seconds = 10000;
options.env = mock_env_.get();
options.bottommost_temperature = Temperature::kCold;
options.num_levels = kNumLevels;
options.level_compaction_dynamic_level_bytes = true;
// TODO(zjay): for level compaction, auto-compaction may stuck in deadloop, if
// the penultimate level score > 1, but the hot is not cold enough to compact
// to last level, which will keep triggering compaction.
options.disable_auto_compactions = true;
DestroyAndReopen(options);
int sst_num = 0;
// Write files that are overlap
for (; sst_num < 4; sst_num++) {
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
}
ASSERT_OK(Flush());
}
CompactRangeOptions cro;
cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
// All data is hot, only output to penultimate level
ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
// read a random key, which should be hot (kUnknown)
AssertKeyTemperature(20, Temperature::kUnknown);
// Adding more data to have mixed hot and cold data
for (; sst_num < 14; sst_num++) {
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
}
ASSERT_OK(Flush());
}
// Second to last level
MoveFilesToLevel(5);
ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
// Compact the files to the last level which should split the hot/cold data
MoveFilesToLevel(6);
uint64_t hot_data_size = GetSstSizeHelper(Temperature::kUnknown);
uint64_t cold_data_size = GetSstSizeHelper(Temperature::kCold);
ASSERT_GT(hot_data_size, 0);
ASSERT_GT(cold_data_size, 0);
// the first a few key should be cold
AssertKeyTemperature(20, Temperature::kCold);
// Wait some time, with each wait, the cold data is increasing and hot data is
// decreasing
for (int i = 0; i < 30; i++) {
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(200)); });
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
uint64_t pre_hot = hot_data_size;
uint64_t pre_cold = cold_data_size;
hot_data_size = GetSstSizeHelper(Temperature::kUnknown);
cold_data_size = GetSstSizeHelper(Temperature::kCold);
ASSERT_LT(hot_data_size, pre_hot);
ASSERT_GT(cold_data_size, pre_cold);
// the hot/cold cut_off key should be around i * 20 + 400 -> 450
AssertKeyTemperature(i * 20 + 450, Temperature::kUnknown);
AssertKeyTemperature(i * 20 + 400, Temperature::kCold);
}
// Wait again, the most of the data should be cold after that
// hot data might not be empty, because if we don't write new data, there's
// no seqno->time sampling available to the compaction
for (int i = 0; i < 5; i++) {
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(1000)); });
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
}
// any random data close to the end should be cold
AssertKeyTemperature(1000, Temperature::kCold);
Close();
}
enum class SeqnoTimeTestType : char {
kTrackInternalTimeSeconds = 0,
kPrecludeLastLevel = 1,
kBothSetTrackSmaller = 2,
};
class SeqnoTimeTablePropTest
: public SeqnoTimeTest,
public ::testing::WithParamInterface<SeqnoTimeTestType> {
public:
SeqnoTimeTablePropTest() : SeqnoTimeTest() {}
void SetTrackTimeDurationOptions(uint64_t track_time_duration,
Options& options) const {
// either option set will enable the time tracking feature
switch (GetParam()) {
case SeqnoTimeTestType::kTrackInternalTimeSeconds:
options.preclude_last_level_data_seconds = 0;
options.preserve_internal_time_seconds = track_time_duration;
break;
case SeqnoTimeTestType::kPrecludeLastLevel:
options.preclude_last_level_data_seconds = track_time_duration;
options.preserve_internal_time_seconds = 0;
break;
case SeqnoTimeTestType::kBothSetTrackSmaller:
options.preclude_last_level_data_seconds = track_time_duration;
options.preserve_internal_time_seconds = track_time_duration / 10;
break;
}
}
};
INSTANTIATE_TEST_CASE_P(
SeqnoTimeTablePropTest, SeqnoTimeTablePropTest,
::testing::Values(SeqnoTimeTestType::kTrackInternalTimeSeconds,
SeqnoTimeTestType::kPrecludeLastLevel,
SeqnoTimeTestType::kBothSetTrackSmaller));
TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) {
Options options = CurrentOptions();
SetTrackTimeDurationOptions(10000, options);
options.env = mock_env_.get();
options.disable_auto_compactions = true;
DestroyAndReopen(options);
std::set<uint64_t> checked_file_nums;
SequenceNumber start_seq = dbfull()->GetLatestSequenceNumber() + 1;
uint64_t start_time = mock_clock_->NowSeconds();
// Write a key every 10 seconds
for (int i = 0; i < 200; i++) {
ASSERT_OK(Put(Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
}
ASSERT_OK(Flush());
TablePropertiesCollection tables_props;
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
ASSERT_EQ(tables_props.size(), 1);
auto it = tables_props.begin();
SeqnoToTimeMapping tp_mapping;
ASSERT_OK(tp_mapping.DecodeFrom(it->second->seqno_to_time_mapping));
ASSERT_TRUE(tp_mapping.TEST_IsEnforced());
ASSERT_FALSE(tp_mapping.Empty());
auto seqs = tp_mapping.TEST_GetInternalMapping();
// about ~20 seqs->time entries, because the sample rate is 10000/100, and it
// passes 2k time. Add (roughly) one for starting entry.
// Revised: with automatic pre-population of mappings, some of these entries
// might be purged to keep the DB mapping within capacity.
EXPECT_GE(seqs.size(), 20 / 2);
EXPECT_LE(seqs.size(), 22);
auto ValidateProximalSeqnos = [&](const char* name, double fuzz_ratio) {
SequenceNumber seq_end = dbfull()->GetLatestSequenceNumber() + 1;
uint64_t end_time = mock_clock_->NowSeconds();
uint64_t seqno_fuzz =
static_cast<uint64_t>((seq_end - start_seq) * fuzz_ratio + 0.999999);
for (unsigned time_pct = 0; time_pct <= 100; time_pct++) {
SCOPED_TRACE("name=" + std::string(name) +
" time_pct=" + std::to_string(time_pct));
// Validate the important proximal API (GetProximalSeqnoBeforeTime)
uint64_t t = start_time + time_pct * (end_time - start_time) / 100;
auto seqno_reported = tp_mapping.GetProximalSeqnoBeforeTime(t);
auto seqno_expected = start_seq + time_pct * (seq_end - start_seq) / 100;
EXPECT_LE(seqno_reported, seqno_expected);
if (end_time - t < 10000) {
EXPECT_LE(seqno_expected, seqno_reported + seqno_fuzz);
}
}
start_seq = seq_end;
start_time = end_time;
};
ValidateProximalSeqnos("a", 0.1);
checked_file_nums.insert(it->second->orig_file_number);
// Write a key every 1 seconds
for (int i = 0; i < 200; i++) {
ASSERT_OK(Put(Key(i + 190), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(1)); });
}
ASSERT_OK(Flush());
tables_props.clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
ASSERT_EQ(tables_props.size(), 2);
it = tables_props.begin();
while (it != tables_props.end()) {
if (!checked_file_nums.count(it->second->orig_file_number)) {
break;
}
it++;
}
ASSERT_TRUE(it != tables_props.end());
tp_mapping.Clear();
ASSERT_OK(tp_mapping.DecodeFrom(it->second->seqno_to_time_mapping));
ASSERT_TRUE(tp_mapping.TEST_IsEnforced());
seqs = tp_mapping.TEST_GetInternalMapping();
// There only a few time sample
ASSERT_GE(seqs.size(), 1);
ASSERT_LE(seqs.size(), 3);
// High fuzz ratio because of low number of samples
ValidateProximalSeqnos("b", 0.5);
checked_file_nums.insert(it->second->orig_file_number);
// Write a key every 200 seconds
for (int i = 0; i < 200; i++) {
ASSERT_OK(Put(Key(i + 380), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(200)); });
}
// seq_end = dbfull()->GetLatestSequenceNumber() + 1;
ASSERT_OK(Flush());
tables_props.clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
ASSERT_EQ(tables_props.size(), 3);
it = tables_props.begin();
while (it != tables_props.end()) {
if (!checked_file_nums.count(it->second->orig_file_number)) {
break;
}
it++;
}
ASSERT_TRUE(it != tables_props.end());
tp_mapping.Clear();
ASSERT_OK(tp_mapping.DecodeFrom(it->second->seqno_to_time_mapping));
ASSERT_TRUE(tp_mapping.TEST_IsEnforced());
seqs = tp_mapping.TEST_GetInternalMapping();
// For the preserved time span, only 10000/200=50 (+1) entries were recorded
ASSERT_GE(seqs.size(), 50);
ASSERT_LE(seqs.size(), 51);
ValidateProximalSeqnos("c", 0.04);
checked_file_nums.insert(it->second->orig_file_number);
// Write a key every 100 seconds
for (int i = 0; i < 200; i++) {
ASSERT_OK(Put(Key(i + 570), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
}
ASSERT_OK(Flush());
tables_props.clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
ASSERT_EQ(tables_props.size(), 4);
it = tables_props.begin();
while (it != tables_props.end()) {
if (!checked_file_nums.count(it->second->orig_file_number)) {
break;
}
it++;
}
ASSERT_TRUE(it != tables_props.end());
tp_mapping.Clear();
ASSERT_OK(tp_mapping.DecodeFrom(it->second->seqno_to_time_mapping));
ASSERT_TRUE(tp_mapping.TEST_IsEnforced());
seqs = tp_mapping.TEST_GetInternalMapping();
// For the preserved time span, max entries were recorded and
// preserved (10000/100=100 (+1))
ASSERT_GE(seqs.size(), 99);
ASSERT_LE(seqs.size(), 101);
checked_file_nums.insert(it->second->orig_file_number);
// re-enable compaction
ASSERT_OK(dbfull()->SetOptions({
{"disable_auto_compactions", "false"},
}));
ASSERT_OK(dbfull()->TEST_WaitForCompact());
tables_props.clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
ASSERT_GE(tables_props.size(), 1);
it = tables_props.begin();
while (it != tables_props.end()) {
if (!checked_file_nums.count(it->second->orig_file_number)) {
break;
}
it++;
}
ASSERT_TRUE(it != tables_props.end());
tp_mapping.Clear();
ASSERT_OK(tp_mapping.DecodeFrom(it->second->seqno_to_time_mapping));
ASSERT_TRUE(tp_mapping.TEST_IsEnforced());
seqs = tp_mapping.TEST_GetInternalMapping();
ASSERT_GE(seqs.size(), 99);
ASSERT_LE(seqs.size(), 101);
ValidateProximalSeqnos("d", 0.02);
ASSERT_OK(db_->Close());
}
TEST_P(SeqnoTimeTablePropTest, MultiCFs) {
Options options = CurrentOptions();
options.preclude_last_level_data_seconds = 0;
options.preserve_internal_time_seconds = 0;
options.env = mock_env_.get();
options.stats_dump_period_sec = 0;
options.stats_persist_period_sec = 0;
ReopenWithColumnFamilies({"default"}, options);
const PeriodicTaskScheduler& scheduler =
dbfull()->TEST_GetPeriodicTaskScheduler();
ASSERT_FALSE(scheduler.TEST_HasTask(PeriodicTaskType::kRecordSeqnoTime));
// Write some data and increase the current time
for (int i = 0; i < 200; i++) {
ASSERT_OK(Put(Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
}
ASSERT_OK(Flush());
TablePropertiesCollection tables_props;
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
ASSERT_EQ(tables_props.size(), 1);
auto it = tables_props.begin();
ASSERT_TRUE(it->second->seqno_to_time_mapping.empty());
ASSERT_TRUE(dbfull()->TEST_GetSeqnoToTimeMapping().Empty());
Options options_1 = options;
SetTrackTimeDurationOptions(10000, options_1);
CreateColumnFamilies({"one"}, options_1);
ASSERT_TRUE(scheduler.TEST_HasTask(PeriodicTaskType::kRecordSeqnoTime));
// Write some data to the default CF (without preclude_last_level feature)
for (int i = 0; i < 200; i++) {
ASSERT_OK(Put(Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
}
ASSERT_OK(Flush());
// Write some data to the CF one
for (int i = 0; i < 20; i++) {
ASSERT_OK(Put(1, Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
}
ASSERT_OK(Flush(1));
tables_props.clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[1], &tables_props));
ASSERT_EQ(tables_props.size(), 1);
it = tables_props.begin();
SeqnoToTimeMapping tp_mapping;
ASSERT_OK(tp_mapping.DecodeFrom(it->second->seqno_to_time_mapping));
ASSERT_TRUE(tp_mapping.TEST_IsEnforced());
ASSERT_FALSE(tp_mapping.Empty());
auto seqs = tp_mapping.TEST_GetInternalMapping();
ASSERT_GE(seqs.size(), 1);
ASSERT_LE(seqs.size(), 4);
// Create one more CF with larger preclude_last_level time
Options options_2 = options;
SetTrackTimeDurationOptions(1000000, options_2); // 1m
CreateColumnFamilies({"two"}, options_2);
// Add more data to CF "two" to fill the in memory mapping
for (int i = 0; i < 2000; i++) {
ASSERT_OK(Put(2, Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
}
seqs = dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping();
ASSERT_GE(seqs.size(), 1000 - 1);
// Non-strict limit can exceed capacity by a reasonable fraction
ASSERT_LE(seqs.size(), 1000 * 9 / 8);
ASSERT_OK(Flush(2));
tables_props.clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[2], &tables_props));
ASSERT_EQ(tables_props.size(), 1);
it = tables_props.begin();
tp_mapping.Clear();
ASSERT_OK(tp_mapping.DecodeFrom(it->second->seqno_to_time_mapping));
ASSERT_TRUE(tp_mapping.TEST_IsEnforced());
seqs = tp_mapping.TEST_GetInternalMapping();
// the max encoded entries is 100
ASSERT_GE(seqs.size(), 100 - 1);
ASSERT_LE(seqs.size(), 100 + 1);
// Write some data to default CF, as all memtable with preclude_last_level
// enabled have flushed, the in-memory seqno->time mapping should be cleared
for (int i = 0; i < 10; i++) {
ASSERT_OK(Put(0, Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
}
seqs = dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping();
ASSERT_OK(Flush(0));
// trigger compaction for CF "two" and make sure the compaction output has
// seqno_to_time_mapping
for (int j = 0; j < 3; j++) {
for (int i = 0; i < 200; i++) {
ASSERT_OK(Put(2, Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
}
ASSERT_OK(Flush(2));
}
ASSERT_OK(dbfull()->TEST_WaitForCompact());
tables_props.clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[2], &tables_props));
ASSERT_EQ(tables_props.size(), 1);
it = tables_props.begin();
tp_mapping.Clear();
ASSERT_OK(tp_mapping.DecodeFrom(it->second->seqno_to_time_mapping));
ASSERT_TRUE(tp_mapping.TEST_IsEnforced());
seqs = tp_mapping.TEST_GetInternalMapping();
ASSERT_GE(seqs.size(), 99);
ASSERT_LE(seqs.size(), 101);
for (int i = 0; i < 200; i++) {
ASSERT_OK(Put(0, Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
}
ASSERT_OK(Flush(0));
ASSERT_OK(dbfull()->TEST_WaitForCompact());
tables_props.clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[0], &tables_props));
ASSERT_EQ(tables_props.size(), 1);
it = tables_props.begin();
ASSERT_TRUE(it->second->seqno_to_time_mapping.empty());
// Write some data to CF "two", but don't flush to accumulate
for (int i = 0; i < 1000; i++) {
ASSERT_OK(Put(2, Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
}
ASSERT_GE(
dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping().size(),
500);
// After dropping CF "one", the in-memory mapping will be change to only
// follow CF "two" options.
ASSERT_OK(db_->DropColumnFamily(handles_[1]));
ASSERT_LE(
dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping().size(),
100 + 5);
// After dropping CF "two", the in-memory mapping is also clear.
ASSERT_OK(db_->DropColumnFamily(handles_[2]));
ASSERT_EQ(
dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping().size(),
0);
// And the timer worker is stopped
ASSERT_FALSE(scheduler.TEST_HasTask(PeriodicTaskType::kRecordSeqnoTime));
Close();
}
TEST_P(SeqnoTimeTablePropTest, MultiInstancesBasic) {
const int kInstanceNum = 2;
Options options = CurrentOptions();
SetTrackTimeDurationOptions(10000, options);
options.env = mock_env_.get();
options.stats_dump_period_sec = 0;
options.stats_persist_period_sec = 0;
auto dbs = std::vector<DB*>(kInstanceNum);
for (int i = 0; i < kInstanceNum; i++) {
ASSERT_OK(
DB::Open(options, test::PerThreadDBPath(std::to_string(i)), &(dbs[i])));
}
// Make sure the second instance has the worker enabled
auto dbi = static_cast_with_check<DBImpl>(dbs[1]);
WriteOptions wo;
for (int i = 0; i < 200; i++) {
ASSERT_OK(dbi->Put(wo, Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
}
SeqnoToTimeMapping seqno_to_time_mapping = dbi->TEST_GetSeqnoToTimeMapping();
ASSERT_GT(seqno_to_time_mapping.Size(), 10);
for (int i = 0; i < kInstanceNum; i++) {
ASSERT_OK(dbs[i]->Close());
delete dbs[i];
}
}
TEST_P(SeqnoTimeTablePropTest, SeqnoToTimeMappingUniversal) {
const int kNumTrigger = 4;
const int kNumLevels = 7;
const int kNumKeys = 100;
Options options = CurrentOptions();
SetTrackTimeDurationOptions(10000, options);
options.compaction_style = kCompactionStyleUniversal;
options.num_levels = kNumLevels;
options.env = mock_env_.get();
DestroyAndReopen(options);
std::atomic_uint64_t num_seqno_zeroing{0};
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
SyncPoint::GetInstance()->SetCallBack(
"CompactionIterator::PrepareOutput:ZeroingSeq",
[&](void* /*arg*/) { num_seqno_zeroing++; });
SyncPoint::GetInstance()->EnableProcessing();
int sst_num = 0;
for (; sst_num < kNumTrigger - 1; sst_num++) {
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
}
ASSERT_OK(Flush());
}
TablePropertiesCollection tables_props;
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
ASSERT_EQ(tables_props.size(), 3);
for (const auto& props : tables_props) {
ASSERT_FALSE(props.second->seqno_to_time_mapping.empty());
SeqnoToTimeMapping tp_mapping;
ASSERT_OK(tp_mapping.DecodeFrom(props.second->seqno_to_time_mapping));
ASSERT_TRUE(tp_mapping.TEST_IsEnforced());
ASSERT_FALSE(tp_mapping.Empty());
auto seqs = tp_mapping.TEST_GetInternalMapping();
// Add (roughly) one for starting entry.
ASSERT_GE(seqs.size(), 10);
ASSERT_LE(seqs.size(), 10 + 2);
}
// Trigger a compaction
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
}
sst_num++;
ASSERT_OK(Flush());
ASSERT_OK(dbfull()->TEST_WaitForCompact());
tables_props.clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
ASSERT_EQ(tables_props.size(), 1);
auto it = tables_props.begin();
SeqnoToTimeMapping tp_mapping;
ASSERT_FALSE(it->second->seqno_to_time_mapping.empty());
ASSERT_OK(tp_mapping.DecodeFrom(it->second->seqno_to_time_mapping));
ASSERT_TRUE(tp_mapping.TEST_IsEnforced());
// compact to the last level
CompactRangeOptions cro;
cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
// make sure the data is all compacted to penultimate level if the feature is
// on, otherwise, compacted to the last level.
if (options.preclude_last_level_data_seconds > 0) {
ASSERT_GT(NumTableFilesAtLevel(5), 0);
ASSERT_EQ(NumTableFilesAtLevel(6), 0);
} else {
ASSERT_EQ(NumTableFilesAtLevel(5), 0);
ASSERT_GT(NumTableFilesAtLevel(6), 0);
}
// regardless the file is on the last level or not, it should keep the time
// information and sequence number are not set
tables_props.clear();
tp_mapping.Clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
ASSERT_EQ(tables_props.size(), 1);
ASSERT_EQ(num_seqno_zeroing, 0);
it = tables_props.begin();
ASSERT_FALSE(it->second->seqno_to_time_mapping.empty());
ASSERT_OK(tp_mapping.DecodeFrom(it->second->seqno_to_time_mapping));
ASSERT_TRUE(tp_mapping.TEST_IsEnforced());
// make half of the data expired
mock_clock_->MockSleepForSeconds(static_cast<int>(8000));
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
tables_props.clear();
tp_mapping.Clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
if (options.preclude_last_level_data_seconds > 0) {
ASSERT_EQ(tables_props.size(), 2);
} else {
ASSERT_EQ(tables_props.size(), 1);
}
ASSERT_GT(num_seqno_zeroing, 0);
std::vector<KeyVersion> key_versions;
ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
std::numeric_limits<size_t>::max(),
&key_versions));
// make sure there're more than 300 keys and first 100 keys are having seqno
// zeroed out, the last 100 key seqno not zeroed out
ASSERT_GT(key_versions.size(), 300);
for (int i = 0; i < 100; i++) {
ASSERT_EQ(key_versions[i].sequence, 0);
}
auto rit = key_versions.rbegin();
for (int i = 0; i < 100; i++) {
ASSERT_GT(rit->sequence, 0);
rit++;
}
// make all data expired and compact again to push it to the last level
// regardless if the tiering feature is enabled or not
mock_clock_->MockSleepForSeconds(static_cast<int>(20000));
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
ASSERT_GT(num_seqno_zeroing, 0);
ASSERT_GT(NumTableFilesAtLevel(6), 0);
Close();
}
TEST_P(SeqnoTimeTablePropTest, PrePopulateInDB) {
Options base_options = CurrentOptions();
base_options.env = mock_env_.get();
base_options.disable_auto_compactions = true;
base_options.create_missing_column_families = true;
Options track_options = base_options;
constexpr uint32_t kPreserveSecs = 1234567;
SetTrackTimeDurationOptions(kPreserveSecs, track_options);
SeqnoToTimeMapping sttm;
SequenceNumber latest_seqno;
uint64_t start_time, end_time;
// #### DB#1, #2: No pre-population without preserve/preclude ####
// #### But a single entry is added when preserve/preclude enabled ####
for (bool with_write : {false, true}) {
SCOPED_TRACE("with_write=" + std::to_string(with_write));
DestroyAndReopen(base_options);
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
ASSERT_TRUE(sttm.Empty());
ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
if (with_write) {
// Ensure that writes before new CF with preserve/preclude option don't
// interfere with the seqno-to-time mapping getting a starting entry.
ASSERT_OK(Put("foo", "bar"));
ASSERT_OK(Flush());
} else {
// FIXME: currently, starting entry after CreateColumnFamily requires
// non-zero seqno
ASSERT_OK(Delete("blah"));
}
// Unfortunately, if we add a CF with preserve/preclude option after
// open, that does not reserve seqnos with pre-populated time mappings.
CreateColumnFamilies({"one"}, track_options);
// No pre-population (unfortunately), just a single starting entry
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
latest_seqno = db_->GetLatestSequenceNumber();
start_time = mock_clock_->NowSeconds();
ASSERT_EQ(sttm.Size(), 1);
ASSERT_EQ(latest_seqno, 1U);
// Current time maps to starting entry / seqno
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time), 1U);
// Any older times are unknown.
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time - 1),
kUnknownSeqnoBeforeAll);
// Now check that writes can proceed normally (passing about 20% of preserve
// time)
for (int i = 0; i < 20; i++) {
ASSERT_OK(Put(Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun([&] {
mock_clock_->MockSleepForSeconds(static_cast<int>(kPreserveSecs / 99));
});
}
ASSERT_OK(Flush());
// Check that mappings are getting populated
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
latest_seqno = db_->GetLatestSequenceNumber();
end_time = mock_clock_->NowSeconds();
ASSERT_EQ(sttm.Size(), 21);
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(end_time), latest_seqno);
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time), 1U);
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time - 1),
kUnknownSeqnoBeforeAll);
}
// ### DB#3, #4: Read-only DB with preserve/preclude after not ####
// Make sure we don't hit issues with read-only DBs, which don't need
// the mapping in the DB state (though it wouldn't hurt anything)
for (bool with_write : {false, true}) {
SCOPED_TRACE("with_write=" + std::to_string(with_write));
DestroyAndReopen(base_options);
if (with_write) {
ASSERT_OK(Put("foo", "bar"));
ASSERT_OK(Flush());
}
ASSERT_OK(ReadOnlyReopen(base_options));
if (with_write) {
ASSERT_EQ(Get("foo"), "bar");
}
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
ASSERT_EQ(sttm.Size(), 0);
if (!with_write) {
ASSERT_EQ(db_->GetLatestSequenceNumber(), 0);
}
ASSERT_OK(ReadOnlyReopen(track_options));
if (with_write) {
ASSERT_EQ(Get("foo"), "bar");
}
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
ASSERT_EQ(sttm.Size(), 0);
if (!with_write) {
ASSERT_EQ(db_->GetLatestSequenceNumber(), 0);
// And even if we re-open read-write, we do not get pre-population,
// because that's only for new DBs.
Reopen(track_options);
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
ASSERT_EQ(sttm.Size(), 0);
ASSERT_EQ(db_->GetLatestSequenceNumber(), 0);
}
}
// #### DB#5: Destroy and open with preserve/preclude option ####
DestroyAndReopen(track_options);
// Ensure pre-population
constexpr auto kPrePopPairs = kMaxSeqnoTimePairsPerSST;
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
latest_seqno = db_->GetLatestSequenceNumber();
start_time = mock_clock_->NowSeconds();
ASSERT_EQ(sttm.Size(), kPrePopPairs);
// One nono-zero sequence number per pre-populated pair (this could be
// revised if we want to use interpolation for better approximate time
// mappings with no guarantee of erring in just one direction).
ASSERT_EQ(latest_seqno, kPrePopPairs);
// Current time maps to last pre-allocated seqno
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time), latest_seqno);
// Oldest tracking time maps to first pre-allocated seqno
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time - kPreserveSecs), 1);
// In more detail, check that estimated seqnos (pre-allocated) are uniformly
// spread over the tracked time.
for (auto ratio : {0.0, 0.433, 0.678, 0.987, 1.0}) {
// Round up query time
uint64_t t = start_time - kPreserveSecs +
static_cast<uint64_t>(ratio * kPreserveSecs + 0.9999999);
// Round down estimated seqno
SequenceNumber s =
static_cast<SequenceNumber>(ratio * (latest_seqno - 1)) + 1;
// Match
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(t), s);
}
// Now check that writes can proceed normally (passing about 20% of preserve
// time)
for (int i = 0; i < 20; i++) {
ASSERT_OK(Put(Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun([&] {
mock_clock_->MockSleepForSeconds(static_cast<int>(kPreserveSecs / 99));
});
}
ASSERT_OK(Flush());
// Can still see some pre-populated mappings, though some displaced
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
latest_seqno = db_->GetLatestSequenceNumber();
end_time = mock_clock_->NowSeconds();
ASSERT_GE(sttm.Size(), kPrePopPairs);
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(end_time), latest_seqno);
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time - kPreserveSecs / 2),
kPrePopPairs / 2);
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time - kPreserveSecs),
kUnknownSeqnoBeforeAll);
// Make sure we don't hit issues with read-only DBs, which don't need
// the mapping in the DB state (though it wouldn't hurt anything)
ASSERT_OK(ReadOnlyReopen(track_options));
ASSERT_EQ(Get(Key(0)), "value");
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
ASSERT_EQ(sttm.Size(), 0);
// #### DB#6: Destroy and open+create an extra CF with preserve/preclude ####
// (default CF does not have the option)
Destroy(track_options);
ReopenWithColumnFamilies({"default", "one"},
List({base_options, track_options}));
// Ensure pre-population (not as exhaustive checking here)
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
latest_seqno = db_->GetLatestSequenceNumber();
start_time = mock_clock_->NowSeconds();
ASSERT_EQ(sttm.Size(), kPrePopPairs);
// One nono-zero sequence number per pre-populated pair (this could be
// revised if we want to use interpolation for better approximate time
// mappings with no guarantee of erring in just one direction).
ASSERT_EQ(latest_seqno, kPrePopPairs);
// Current time maps to last pre-allocated seqno
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time), latest_seqno);
// Oldest tracking time maps to first pre-allocated seqno
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time - kPreserveSecs), 1);
// Even after no writes and DB re-open without tracking options, sequence
// numbers should not go backward into those that were pre-allocated.
// (Future work: persist the mapping)
ReopenWithColumnFamilies({"default", "one"},
List({base_options, base_options}));
ASSERT_EQ(latest_seqno, db_->GetLatestSequenceNumber());
Close();
}
TEST_F(SeqnoTimeTest, MappingAppend) {
using P = SeqnoToTimeMapping::SeqnoTimePair;
SeqnoToTimeMapping test;
test.SetMaxTimeSpan(100).SetCapacity(10);
// ignore seqno == 0, as it may mean the seqno is zeroed out
ASSERT_FALSE(test.Append(0, 100));
ASSERT_TRUE(test.Append(3, 200));
auto size = test.Size();
// normal add
ASSERT_TRUE(test.Append(10, 300));
size++;
ASSERT_EQ(size, test.Size());
// Append with the same seqno, newer time is rejected because that makes
// GetProximalSeqnoBeforeTime queries worse (see later test)
ASSERT_FALSE(test.Append(10, 301));
ASSERT_EQ(size, test.Size());
ASSERT_EQ(test.TEST_GetLastEntry(), P({10, 300}));
// Same or new seqno with same or older time (as last successfully added) is
// accepted by replacing last entry (improves GetProximalSeqnoBeforeTime
// queries without blowing up size)
ASSERT_FALSE(test.Append(10, 299));
ASSERT_EQ(size, test.Size());
ASSERT_EQ(test.TEST_GetLastEntry(), P({10, 299}));
ASSERT_FALSE(test.Append(11, 299));
ASSERT_EQ(size, test.Size());
ASSERT_EQ(test.TEST_GetLastEntry(), P({11, 299}));
ASSERT_FALSE(test.Append(11, 250));
ASSERT_EQ(size, test.Size());
ASSERT_EQ(test.TEST_GetLastEntry(), P({11, 250}));
}
TEST_F(SeqnoTimeTest, CapacityLimits) {
using P = SeqnoToTimeMapping::SeqnoTimePair;
SeqnoToTimeMapping test;
test.SetCapacity(3);
EXPECT_TRUE(test.Append(10, 300));
EXPECT_TRUE(test.Append(20, 400));
EXPECT_TRUE(test.Append(30, 500));
EXPECT_TRUE(test.Append(40, 600));
// Capacity 3 is small enough that the non-strict limit is
// equal to the strict limit.
EXPECT_EQ(3U, test.Size());
EXPECT_EQ(test.TEST_GetLastEntry(), P({40, 600}));
// Same for Capacity 2
test.SetCapacity(2);
EXPECT_EQ(2U, test.Size());
EXPECT_EQ(test.TEST_GetLastEntry(), P({40, 600}));
EXPECT_TRUE(test.Append(50, 700));
EXPECT_EQ(2U, test.Size());
EXPECT_EQ(test.TEST_GetLastEntry(), P({50, 700}));
// Capacity 1 is difficult to work with internally, so is
// coerced to 2.
test.SetCapacity(1);
EXPECT_EQ(2U, test.Size());
EXPECT_EQ(test.TEST_GetLastEntry(), P({50, 700}));
EXPECT_TRUE(test.Append(60, 800));
EXPECT_EQ(2U, test.Size());
EXPECT_EQ(test.TEST_GetLastEntry(), P({60, 800}));
// Capacity 0 means throw everything away
test.SetCapacity(0);
EXPECT_EQ(0U, test.Size());
EXPECT_FALSE(test.Append(70, 900));
EXPECT_EQ(0U, test.Size());
// Unlimited capacity
test.SetCapacity(UINT64_MAX);
for (unsigned i = 1; i <= 10101U; i++) {
EXPECT_TRUE(test.Append(i, 11U * i));
}
EXPECT_EQ(10101U, test.Size());
}
TEST_F(SeqnoTimeTest, TimeSpanLimits) {
SeqnoToTimeMapping test;
// Default: no limit
for (unsigned i = 1; i <= 63U; i++) {
EXPECT_TRUE(test.Append(1000 + i, uint64_t{1} << i));
}
// None dropped.
EXPECT_EQ(63U, test.Size());
test.Clear();
// Explicit no limit
test.SetMaxTimeSpan(UINT64_MAX);
for (unsigned i = 1; i <= 63U; i++) {
EXPECT_TRUE(test.Append(1000 + i, uint64_t{1} << i));
}
// None dropped.
EXPECT_EQ(63U, test.Size());
// We generally keep 2 entries as long as the configured max time span
// is non-zero
test.SetMaxTimeSpan(10);
EXPECT_EQ(2U, test.Size());
test.SetMaxTimeSpan(1);
EXPECT_EQ(2U, test.Size());
// But go down to 1 entry if the max time span is zero
test.SetMaxTimeSpan(0);
EXPECT_EQ(1U, test.Size());
EXPECT_TRUE(test.Append(2000, (uint64_t{1} << 63) + 42U));
EXPECT_EQ(1U, test.Size());
test.Clear();
// Test more typical behavior. Note that one entry at or beyond the max span
// is kept.
test.SetMaxTimeSpan(100);
EXPECT_TRUE(test.Append(1001, 123));
EXPECT_TRUE(test.Append(1002, 134));
EXPECT_TRUE(test.Append(1003, 150));
EXPECT_TRUE(test.Append(1004, 189));
EXPECT_TRUE(test.Append(1005, 220));
EXPECT_EQ(5U, test.Size());
EXPECT_TRUE(test.Append(1006, 233));
EXPECT_EQ(6U, test.Size());
EXPECT_TRUE(test.Append(1007, 234));
EXPECT_EQ(6U, test.Size());
EXPECT_TRUE(test.Append(1008, 235));
EXPECT_EQ(7U, test.Size());
EXPECT_TRUE(test.Append(1009, 300));
EXPECT_EQ(6U, test.Size());
EXPECT_TRUE(test.Append(1010, 350));
EXPECT_EQ(3U, test.Size());
EXPECT_TRUE(test.Append(1011, 470));
EXPECT_EQ(2U, test.Size());
}
TEST_F(SeqnoTimeTest, ProximalFunctions) {
SeqnoToTimeMapping test;
test.SetCapacity(10);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(1), kUnknownTimeBeforeAll);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(1000000000000U),
kUnknownTimeBeforeAll);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(1), kUnknownSeqnoBeforeAll);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(1000000000000U),
kUnknownSeqnoBeforeAll);
// (Taken from example in SeqnoToTimeMapping class comment)
// Time 500 is after seqno 10 and before seqno 11
EXPECT_TRUE(test.Append(10, 500));
// Seqno too early
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(9), kUnknownTimeBeforeAll);
// We only know that 500 is after 10
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(10), kUnknownTimeBeforeAll);
// Found
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(11), 500U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(1000000000000U), 500U);
// Time too early
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(499), kUnknownSeqnoBeforeAll);
// Found
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(500), 10U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(501), 10U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(1000000000000U), 10U);
// More samples
EXPECT_TRUE(test.Append(20, 600));
EXPECT_TRUE(test.Append(30, 700));
EXPECT_EQ(test.Size(), 3U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(10), kUnknownTimeBeforeAll);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(11), 500U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(20), 500U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(21), 600U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(30), 600U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(31), 700U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(1000000000000U), 700U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(499), kUnknownSeqnoBeforeAll);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(500), 10U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(501), 10U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(599), 10U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(600), 20U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(601), 20U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(699), 20U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(700), 30U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(701), 30U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(1000000000000U), 30U);
// Redundant sample ignored
EXPECT_EQ(test.Size(), 3U);
EXPECT_FALSE(test.Append(30, 700));
EXPECT_EQ(test.Size(), 3U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(30), 600U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(31), 700U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(699), 20U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(700), 30U);
// Later sample with same seqno is ignored, to provide best results
// for GetProximalSeqnoBeforeTime function while saving entries
// in SeqnoToTimeMapping.
EXPECT_FALSE(test.Append(30, 800));
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(30), 600U);
// Could return 800, but saving space in SeqnoToTimeMapping instead.
// Can reconsider if/when GetProximalTimeBeforeSeqno is used in
// production.
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(31), 700U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(699), 20U);
// If the existing {30, 700} entry were replaced with {30, 800}, this
// would return seqno 20 instead of 30, which would preclude more than
// necessary for "preclude_last_level_data_seconds" feature.
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(700), 30U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(800), 30U);
// Still OK
EXPECT_TRUE(test.Append(40, 900));
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(30), 600U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(41), 900U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(899), 30U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(900), 40U);
// Burst of writes during a short time creates an opportunity
// for better results from GetProximalSeqnoBeforeTime(), at the
// expense of GetProximalTimeBeforeSeqno(). False return indicates
// merge with previous entry.
EXPECT_FALSE(test.Append(50, 900));
// These are subject to later revision depending on priorities
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(49), 700U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(51), 900U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(899), 30U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(900), 50U);
}
TEST_F(SeqnoTimeTest, PrePopulate) {
SeqnoToTimeMapping test;
test.SetMaxTimeSpan(100).SetCapacity(10);
EXPECT_EQ(test.Size(), 0U);
// Smallest case is like two Appends
test.PrePopulate(10, 11, 500, 600);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(10), kUnknownTimeBeforeAll);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(11), 500U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(12), 600U);
test.Clear();
// Populate a small range
uint64_t kTimeIncrement = 1234567;
test.PrePopulate(1, 12, kTimeIncrement, kTimeIncrement * 2);
for (uint64_t i = 0; i <= 12; ++i) {
// NOTE: with 1 and 12 as the pre-populated end points, the duration is
// broken into 11 equal(-ish) spans
uint64_t t = kTimeIncrement + (i * kTimeIncrement) / 11 - 1;
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(t), i);
}
test.Clear();
// Populate an excessively large range (in the future we might want to
// interpolate estimated times for seqnos between entries)
test.PrePopulate(1, 34567, kTimeIncrement, kTimeIncrement * 2);
for (auto ratio : {0.0, 0.433, 0.678, 0.987, 1.0}) {
// Round up query time
uint64_t t = kTimeIncrement +
static_cast<uint64_t>(ratio * kTimeIncrement + 0.9999999);
// Round down estimated seqno
SequenceNumber s = static_cast<SequenceNumber>(ratio * (34567 - 1)) + 1;
// Match
// TODO: for now this is exact, but in the future might need approximation
// bounds to account for limited samples.
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(t), s);
}
}
TEST_F(SeqnoTimeTest, CopyFromSeqnoRange) {
SeqnoToTimeMapping test_from;
SeqnoToTimeMapping test_to;
// With zero to draw from
test_to.Clear();
test_to.CopyFromSeqnoRange(test_from, 0, 1000000);
EXPECT_EQ(test_to.Size(), 0U);
test_to.Clear();
test_to.CopyFromSeqnoRange(test_from, 100, 100);
EXPECT_EQ(test_to.Size(), 0U);
test_to.Clear();
test_to.CopyFromSeqnoRange(test_from, kMaxSequenceNumber, 0);
EXPECT_EQ(test_to.Size(), 0U);
// With one to draw from
EXPECT_TRUE(test_from.Append(10, 500));
test_to.Clear();
test_to.CopyFromSeqnoRange(test_from, 0, 1000000);
EXPECT_EQ(test_to.Size(), 1U);
// Includes one entry before range
test_to.Clear();
test_to.CopyFromSeqnoRange(test_from, 100, 100);
EXPECT_EQ(test_to.Size(), 1U);
// Includes one entry before range (even if somewhat nonsensical)
test_to.Clear();
test_to.CopyFromSeqnoRange(test_from, kMaxSequenceNumber, 0);
EXPECT_EQ(test_to.Size(), 1U);
test_to.Clear();
test_to.CopyFromSeqnoRange(test_from, 0, 9);
EXPECT_EQ(test_to.Size(), 0U);
test_to.Clear();
test_to.CopyFromSeqnoRange(test_from, 0, 10);
EXPECT_EQ(test_to.Size(), 1U);
// With more to draw from
EXPECT_TRUE(test_from.Append(20, 600));
EXPECT_TRUE(test_from.Append(30, 700));
EXPECT_TRUE(test_from.Append(40, 800));
EXPECT_TRUE(test_from.Append(50, 900));
test_to.Clear();
test_to.CopyFromSeqnoRange(test_from, 0, 1000000);
EXPECT_EQ(test_to.Size(), 5U);
// Includes one entry before range
test_to.Clear();
test_to.CopyFromSeqnoRange(test_from, 100, 100);
EXPECT_EQ(test_to.Size(), 1U);
test_to.Clear();
test_to.CopyFromSeqnoRange(test_from, 19, 19);
EXPECT_EQ(test_to.Size(), 1U);
// Includes one entry before range (even if somewhat nonsensical)
test_to.Clear();
test_to.CopyFromSeqnoRange(test_from, kMaxSequenceNumber, 0);
EXPECT_EQ(test_to.Size(), 1U);
test_to.Clear();
test_to.CopyFromSeqnoRange(test_from, 0, 9);
EXPECT_EQ(test_to.Size(), 0U);
test_to.Clear();
test_to.CopyFromSeqnoRange(test_from, 0, 10);
EXPECT_EQ(test_to.Size(), 1U);
test_to.Clear();
test_to.CopyFromSeqnoRange(test_from, 20, 20);
EXPECT_EQ(test_to.Size(), 2U);
test_to.Clear();
test_to.CopyFromSeqnoRange(test_from, 20, 29);
EXPECT_EQ(test_to.Size(), 2U);
test_to.Clear();
test_to.CopyFromSeqnoRange(test_from, 20, 30);
EXPECT_EQ(test_to.Size(), 3U);
}
TEST_F(SeqnoTimeTest, EnforceWithNow) {
constexpr uint64_t kMaxTimeSpan = 420;
SeqnoToTimeMapping test;
test.SetMaxTimeSpan(kMaxTimeSpan).SetCapacity(10);
EXPECT_EQ(test.Size(), 0U);
// Safe on empty mapping
test.Enforce(/*now=*/500);
EXPECT_EQ(test.Size(), 0U);
// (Taken from example in SeqnoToTimeMapping class comment)
// Time 500 is after seqno 10 and before seqno 11
EXPECT_TRUE(test.Append(10, 500));
EXPECT_TRUE(test.Append(20, 600));
EXPECT_TRUE(test.Append(30, 700));
EXPECT_TRUE(test.Append(40, 800));
EXPECT_TRUE(test.Append(50, 900));
EXPECT_EQ(test.Size(), 5U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(500), 10U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(599), 10U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(600), 20U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(699), 20U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(700), 30U);
// etc.
// Must keep first entry
test.Enforce(/*now=*/500 + kMaxTimeSpan);
EXPECT_EQ(test.Size(), 5U);
test.Enforce(/*now=*/599 + kMaxTimeSpan);
EXPECT_EQ(test.Size(), 5U);
// Purges first entry
test.Enforce(/*now=*/600 + kMaxTimeSpan);
EXPECT_EQ(test.Size(), 4U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(500), kUnknownSeqnoBeforeAll);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(599), kUnknownSeqnoBeforeAll);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(600), 20U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(699), 20U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(700), 30U);
// No effect
test.Enforce(/*now=*/600 + kMaxTimeSpan);
EXPECT_EQ(test.Size(), 4U);
test.Enforce(/*now=*/699 + kMaxTimeSpan);
EXPECT_EQ(test.Size(), 4U);
// Purges next two
test.Enforce(/*now=*/899 + kMaxTimeSpan);
EXPECT_EQ(test.Size(), 2U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(799), kUnknownSeqnoBeforeAll);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(899), 40U);
// Always keep last entry, to have a non-trivial seqno bound
test.Enforce(/*now=*/10000000);
EXPECT_EQ(test.Size(), 1U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(10000000), 50U);
}
TEST_F(SeqnoTimeTest, Sort) {
SeqnoToTimeMapping test;
// single entry
test.AddUnenforced(10, 11);
test.Enforce();
ASSERT_EQ(test.Size(), 1);
// duplicate is ignored
test.AddUnenforced(10, 11);
test.Enforce();
ASSERT_EQ(test.Size(), 1);
// add some revised mappings for that seqno
test.AddUnenforced(10, 10);
test.AddUnenforced(10, 12);
// We currently favor GetProximalSeqnoBeforeTime over
// GetProximalTimeBeforeSeqno by keeping the older time.
test.Enforce();
auto seqs = test.TEST_GetInternalMapping();
std::deque<SeqnoToTimeMapping::SeqnoTimePair> expected;
expected.emplace_back(10, 10);
ASSERT_EQ(expected, seqs);
// add an inconsistent / unuseful mapping
test.AddUnenforced(9, 11);
test.Enforce();
seqs = test.TEST_GetInternalMapping();
ASSERT_EQ(expected, seqs);
// And a mapping that is considered more useful (for
// GetProximalSeqnoBeforeTime) and thus replaces that one
test.AddUnenforced(11, 9);
test.Enforce();
seqs = test.TEST_GetInternalMapping();
expected.clear();
expected.emplace_back(11, 9);
ASSERT_EQ(expected, seqs);
// Add more good, non-mergable entries
test.AddUnenforced(1, 5);
test.AddUnenforced(100, 100);
test.Enforce();
seqs = test.TEST_GetInternalMapping();
expected.clear();
expected.emplace_back(1, 5);
expected.emplace_back(11, 9);
expected.emplace_back(100, 100);
ASSERT_EQ(expected, seqs);
}
TEST_F(SeqnoTimeTest, EncodeDecodeBasic) {
constexpr uint32_t kOriginalSamples = 1000;
SeqnoToTimeMapping test;
test.SetCapacity(kOriginalSamples);
std::string output;
test.EncodeTo(output);
ASSERT_TRUE(output.empty());
ASSERT_OK(test.DecodeFrom(output));
ASSERT_EQ(test.Size(), 0U);
Random rnd(123);
for (uint32_t i = 1; i <= kOriginalSamples; i++) {
ASSERT_TRUE(test.Append(i, i * 10 + rnd.Uniform(10)));
}
output.clear();
test.EncodeTo(output);
ASSERT_FALSE(output.empty());
SeqnoToTimeMapping decoded;
ASSERT_OK(decoded.DecodeFrom(output));
ASSERT_TRUE(decoded.TEST_IsEnforced());
ASSERT_EQ(test.Size(), decoded.Size());
ASSERT_EQ(test.TEST_GetInternalMapping(), decoded.TEST_GetInternalMapping());
// Encode a reduced set of mappings
constexpr uint32_t kReducedSize = 51U;
output.clear();
SeqnoToTimeMapping(test).SetCapacity(kReducedSize).EncodeTo(output);
decoded.Clear();
ASSERT_OK(decoded.DecodeFrom(output));
ASSERT_TRUE(decoded.TEST_IsEnforced());
ASSERT_EQ(decoded.Size(), kReducedSize);
for (uint64_t t = 1; t <= kOriginalSamples * 11; t += 1 + t / 100) {
SCOPED_TRACE("t=" + std::to_string(t));
// `test` has the more accurate time mapping, but the reduced set should
// nicely span and approximate the whole range
auto orig_s = test.GetProximalSeqnoBeforeTime(t);
auto approx_s = decoded.GetProximalSeqnoBeforeTime(t);
// The oldest entry should be preserved exactly
ASSERT_EQ(orig_s == kUnknownSeqnoBeforeAll,
approx_s == kUnknownSeqnoBeforeAll);
// The newest entry should be preserved exactly
ASSERT_EQ(orig_s == kOriginalSamples, approx_s == kOriginalSamples);
// Approximate seqno before time should err toward older seqno to avoid
// classifying data as old too early, but should be within a reasonable
// bound.
constexpr uint32_t kSeqnoFuzz = kOriginalSamples * 3 / 2 / kReducedSize;
EXPECT_GE(approx_s + kSeqnoFuzz, orig_s);
EXPECT_GE(orig_s, approx_s);
}
}
TEST_F(SeqnoTimeTest, EncodeDecodeMinimizeTimeGaps) {
SeqnoToTimeMapping test;
test.SetCapacity(10);
test.Append(1, 10);
test.Append(5, 17);
test.Append(6, 25);
test.Append(8, 30);
std::string output;
SeqnoToTimeMapping(test).SetCapacity(3).EncodeTo(output);
SeqnoToTimeMapping decoded;
ASSERT_OK(decoded.DecodeFrom(output));
ASSERT_TRUE(decoded.TEST_IsEnforced());
ASSERT_EQ(decoded.Size(), 3);
auto seqs = decoded.TEST_GetInternalMapping();
std::deque<SeqnoToTimeMapping::SeqnoTimePair> expected;
expected.emplace_back(1, 10);
expected.emplace_back(5, 17);
expected.emplace_back(8, 30);
ASSERT_EQ(expected, seqs);
// Add a few large time number
test.Append(10, 100);
test.Append(13, 200);
test.Append(40, 250);
test.Append(70, 300);
output.clear();
SeqnoToTimeMapping(test).SetCapacity(4).EncodeTo(output);
decoded.Clear();
ASSERT_OK(decoded.DecodeFrom(output));
ASSERT_TRUE(decoded.TEST_IsEnforced());
ASSERT_EQ(decoded.Size(), 4);
expected.clear();
// Except for beginning and end, entries are removed that minimize the
// remaining time gaps, regardless of seqno gaps.
expected.emplace_back(1, 10);
expected.emplace_back(10, 100);
expected.emplace_back(13, 200);
expected.emplace_back(70, 300);
seqs = decoded.TEST_GetInternalMapping();
ASSERT_EQ(expected, seqs);
}
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}