rocksdb/db/seqno_time_test.cc

1380 lines
49 KiB
C++
Raw Normal View History

// Copyright (c) Meta Platforms, Inc. and affiliates.
//
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "db/db_test_util.h"
#include "db/periodic_task_scheduler.h"
#include "db/seqno_to_time_mapping.h"
#include "port/stack_trace.h"
#include "rocksdb/iostats_context.h"
#include "rocksdb/utilities/debug.h"
#include "test_util/mock_time_env.h"
namespace ROCKSDB_NAMESPACE {
class SeqnoTimeTest : public DBTestBase {
public:
SeqnoTimeTest() : DBTestBase("seqno_time_test", /*env_do_fsync=*/false) {
mock_clock_ = std::make_shared<MockSystemClock>(env_->GetSystemClock());
Update tiered storage tests (ahead of next change) (#11917) Summary: After https://github.com/facebook/rocksdb/issues/11905, I am preparing a DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping. **Intended follow-up** However, there are a number of test changes I want to make prior to that to make it clear that I am not regressing the tests and production behavior at the same time. * Start mock time in the tests well beyond epoch (time 0) so that we aren't normally reaching into pre-history for current time minus the preserve/preclude duration. * Majorly clean up BasicSeqnoToTimeMapping to avoid confusing hard-coded bounds on GetProximalTimeBeforeSeqno() results. * There is an unresolved/unexplained issue marked with FIXME that should be investigated when GetProximalTimeBeforeSeqno() is put into production. * MultiCFs test was strangely generating 5 L0 files, four of which would be compacted into an L1, and then letting TTL compaction compact 1@L0+1@L1. Changing the starting time of the tests seemed to mess up the TTL compaction. But I suspect the TTL compaction was unintentional, so I've cut it down to just 4 L0 files, which compacts predictably. * Unrelated: allow ROCKSDB_NO_STACK=1 to skip printing a stack trace on assertion failures. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11917 Test Plan: no changes to production code Reviewed By: jowlyzhang Differential Revision: D49841436 Pulled By: pdillinger fbshipit-source-id: 753348ace9c548e82bcb77fcc8b2ffb7a6beeb0a
2023-10-02 23:19:05 +00:00
mock_clock_->SetCurrentTime(kMockStartTime);
mock_env_ = std::make_unique<CompositeEnvWrapper>(env_, mock_clock_);
}
protected:
std::unique_ptr<Env> mock_env_;
std::shared_ptr<MockSystemClock> mock_clock_;
Update tiered storage tests (ahead of next change) (#11917) Summary: After https://github.com/facebook/rocksdb/issues/11905, I am preparing a DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping. **Intended follow-up** However, there are a number of test changes I want to make prior to that to make it clear that I am not regressing the tests and production behavior at the same time. * Start mock time in the tests well beyond epoch (time 0) so that we aren't normally reaching into pre-history for current time minus the preserve/preclude duration. * Majorly clean up BasicSeqnoToTimeMapping to avoid confusing hard-coded bounds on GetProximalTimeBeforeSeqno() results. * There is an unresolved/unexplained issue marked with FIXME that should be investigated when GetProximalTimeBeforeSeqno() is put into production. * MultiCFs test was strangely generating 5 L0 files, four of which would be compacted into an L1, and then letting TTL compaction compact 1@L0+1@L1. Changing the starting time of the tests seemed to mess up the TTL compaction. But I suspect the TTL compaction was unintentional, so I've cut it down to just 4 L0 files, which compacts predictably. * Unrelated: allow ROCKSDB_NO_STACK=1 to skip printing a stack trace on assertion failures. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11917 Test Plan: no changes to production code Reviewed By: jowlyzhang Differential Revision: D49841436 Pulled By: pdillinger fbshipit-source-id: 753348ace9c548e82bcb77fcc8b2ffb7a6beeb0a
2023-10-02 23:19:05 +00:00
// Sufficient starting time that preserve time doesn't under-flow into
// pre-history
static constexpr uint32_t kMockStartTime = 10000000;
void SetUp() override {
mock_clock_->InstallTimedWaitFixCallback();
SyncPoint::GetInstance()->SetCallBack(
"DBImpl::StartPeriodicTaskScheduler:Init", [&](void* arg) {
auto periodic_task_scheduler_ptr =
reinterpret_cast<PeriodicTaskScheduler*>(arg);
periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock_.get());
});
Update tiered storage tests (ahead of next change) (#11917) Summary: After https://github.com/facebook/rocksdb/issues/11905, I am preparing a DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping. **Intended follow-up** However, there are a number of test changes I want to make prior to that to make it clear that I am not regressing the tests and production behavior at the same time. * Start mock time in the tests well beyond epoch (time 0) so that we aren't normally reaching into pre-history for current time minus the preserve/preclude duration. * Majorly clean up BasicSeqnoToTimeMapping to avoid confusing hard-coded bounds on GetProximalTimeBeforeSeqno() results. * There is an unresolved/unexplained issue marked with FIXME that should be investigated when GetProximalTimeBeforeSeqno() is put into production. * MultiCFs test was strangely generating 5 L0 files, four of which would be compacted into an L1, and then letting TTL compaction compact 1@L0+1@L1. Changing the starting time of the tests seemed to mess up the TTL compaction. But I suspect the TTL compaction was unintentional, so I've cut it down to just 4 L0 files, which compacts predictably. * Unrelated: allow ROCKSDB_NO_STACK=1 to skip printing a stack trace on assertion failures. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11917 Test Plan: no changes to production code Reviewed By: jowlyzhang Differential Revision: D49841436 Pulled By: pdillinger fbshipit-source-id: 753348ace9c548e82bcb77fcc8b2ffb7a6beeb0a
2023-10-02 23:19:05 +00:00
mock_clock_->SetCurrentTime(kMockStartTime);
}
// make sure the file is not in cache, otherwise it won't have IO info
void AssertKeyTemperature(int key_id, Temperature expected_temperature) {
get_iostats_context()->Reset();
IOStatsContext* iostats = get_iostats_context();
std::string result = Get(Key(key_id));
ASSERT_FALSE(result.empty());
ASSERT_GT(iostats->bytes_read, 0);
switch (expected_temperature) {
case Temperature::kUnknown:
ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count,
0);
ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read,
0);
break;
case Temperature::kCold:
ASSERT_GT(iostats->file_io_stats_by_temperature.cold_file_read_count,
0);
ASSERT_GT(iostats->file_io_stats_by_temperature.cold_file_bytes_read,
0);
break;
default:
// the test only support kCold now for the bottommost temperature
FAIL();
}
}
};
TEST_F(SeqnoTimeTest, TemperatureBasicUniversal) {
const int kNumTrigger = 4;
const int kNumLevels = 7;
const int kNumKeys = 100;
const int kKeyPerSec = 10;
Options options = CurrentOptions();
options.compaction_style = kCompactionStyleUniversal;
options.preclude_last_level_data_seconds = 10000;
options.env = mock_env_.get();
options.bottommost_temperature = Temperature::kCold;
options.num_levels = kNumLevels;
DestroyAndReopen(options);
int sst_num = 0;
// Write files that are overlap and enough to trigger compaction
for (; sst_num < kNumTrigger; sst_num++) {
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun([&] {
mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
});
}
ASSERT_OK(Flush());
}
Add WaitForCompact with WaitForCompactOptions to public API (#11436) Summary: Context: This is the first PR for WaitForCompact() Implementation with WaitForCompactOptions. In this PR, we are introducing `Status WaitForCompact(const WaitForCompactOptions& wait_for_compact_options)` in the public API. This currently utilizes the existing internal `WaitForCompact()` implementation (with default abort_on_pause = false). `abort_on_pause` has been moved to `WaitForCompactOptions&`. In the later PRs, we will introduce the following two options in `WaitForCompactOptions` 1. `bool flush = false` by default - If true, flush before waiting for compactions to finish. Must be set to true to ensure no immediate compactions (except perhaps periodic compactions) after closing and re-opening the DB. 2. `bool close_db = false` by default - If true, will also close the DB upon compactions finishing. 1. struct `WaitForCompactOptions` added to options.h and `abort_on_pause` in the internal API moved to the option struct. 2. `Status WaitForCompact(const WaitForCompactOptions& wait_for_compact_options)` introduced in `db.h` 3. Changed the internal WaitForCompact() to `WaitForCompact(const WaitForCompactOptions& wait_for_compact_options)` and checks for the `abort_on_pause` inside the option. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11436 Test Plan: Following tests added - `DBCompactionTest::WaitForCompactWaitsOnCompactionToFinish` - `DBCompactionTest::WaitForCompactAbortOnPauseAborted` - `DBCompactionTest::WaitForCompactContinueAfterPauseNotAborted` - `DBCompactionTest::WaitForCompactShutdownWhileWaiting` - `TransactionTest::WaitForCompactAbortOnPause` NOTE: `TransactionTest::WaitForCompactAbortOnPause` was added to use `StackableDB` to ensure the wrapper function is in place. Reviewed By: pdillinger Differential Revision: D45799659 Pulled By: jaykorean fbshipit-source-id: b5b58f95957f2ab47d1221dee32a61d6cdc4685b
2023-05-26 00:25:51 +00:00
ASSERT_OK(dbfull()->TEST_WaitForCompact());
// All data is hot, only output to penultimate level
ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
// read a random key, which should be hot (kUnknown)
AssertKeyTemperature(20, Temperature::kUnknown);
// Write more data, but still all hot until the 10th SST, as:
// write a key every 10 seconds, 100 keys per SST, each SST takes 1000 seconds
// The preclude_last_level_data_seconds is 10k
for (; sst_num < kNumTrigger * 2; sst_num++) {
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun([&] {
mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
});
}
ASSERT_OK(Flush());
Add WaitForCompact with WaitForCompactOptions to public API (#11436) Summary: Context: This is the first PR for WaitForCompact() Implementation with WaitForCompactOptions. In this PR, we are introducing `Status WaitForCompact(const WaitForCompactOptions& wait_for_compact_options)` in the public API. This currently utilizes the existing internal `WaitForCompact()` implementation (with default abort_on_pause = false). `abort_on_pause` has been moved to `WaitForCompactOptions&`. In the later PRs, we will introduce the following two options in `WaitForCompactOptions` 1. `bool flush = false` by default - If true, flush before waiting for compactions to finish. Must be set to true to ensure no immediate compactions (except perhaps periodic compactions) after closing and re-opening the DB. 2. `bool close_db = false` by default - If true, will also close the DB upon compactions finishing. 1. struct `WaitForCompactOptions` added to options.h and `abort_on_pause` in the internal API moved to the option struct. 2. `Status WaitForCompact(const WaitForCompactOptions& wait_for_compact_options)` introduced in `db.h` 3. Changed the internal WaitForCompact() to `WaitForCompact(const WaitForCompactOptions& wait_for_compact_options)` and checks for the `abort_on_pause` inside the option. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11436 Test Plan: Following tests added - `DBCompactionTest::WaitForCompactWaitsOnCompactionToFinish` - `DBCompactionTest::WaitForCompactAbortOnPauseAborted` - `DBCompactionTest::WaitForCompactContinueAfterPauseNotAborted` - `DBCompactionTest::WaitForCompactShutdownWhileWaiting` - `TransactionTest::WaitForCompactAbortOnPause` NOTE: `TransactionTest::WaitForCompactAbortOnPause` was added to use `StackableDB` to ensure the wrapper function is in place. Reviewed By: pdillinger Differential Revision: D45799659 Pulled By: jaykorean fbshipit-source-id: b5b58f95957f2ab47d1221dee32a61d6cdc4685b
2023-05-26 00:25:51 +00:00
ASSERT_OK(dbfull()->TEST_WaitForCompact());
ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
}
// Now we have both hot data and cold data
for (; sst_num < kNumTrigger * 3; sst_num++) {
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun([&] {
mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
});
}
ASSERT_OK(Flush());
Add WaitForCompact with WaitForCompactOptions to public API (#11436) Summary: Context: This is the first PR for WaitForCompact() Implementation with WaitForCompactOptions. In this PR, we are introducing `Status WaitForCompact(const WaitForCompactOptions& wait_for_compact_options)` in the public API. This currently utilizes the existing internal `WaitForCompact()` implementation (with default abort_on_pause = false). `abort_on_pause` has been moved to `WaitForCompactOptions&`. In the later PRs, we will introduce the following two options in `WaitForCompactOptions` 1. `bool flush = false` by default - If true, flush before waiting for compactions to finish. Must be set to true to ensure no immediate compactions (except perhaps periodic compactions) after closing and re-opening the DB. 2. `bool close_db = false` by default - If true, will also close the DB upon compactions finishing. 1. struct `WaitForCompactOptions` added to options.h and `abort_on_pause` in the internal API moved to the option struct. 2. `Status WaitForCompact(const WaitForCompactOptions& wait_for_compact_options)` introduced in `db.h` 3. Changed the internal WaitForCompact() to `WaitForCompact(const WaitForCompactOptions& wait_for_compact_options)` and checks for the `abort_on_pause` inside the option. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11436 Test Plan: Following tests added - `DBCompactionTest::WaitForCompactWaitsOnCompactionToFinish` - `DBCompactionTest::WaitForCompactAbortOnPauseAborted` - `DBCompactionTest::WaitForCompactContinueAfterPauseNotAborted` - `DBCompactionTest::WaitForCompactShutdownWhileWaiting` - `TransactionTest::WaitForCompactAbortOnPause` NOTE: `TransactionTest::WaitForCompactAbortOnPause` was added to use `StackableDB` to ensure the wrapper function is in place. Reviewed By: pdillinger Differential Revision: D45799659 Pulled By: jaykorean fbshipit-source-id: b5b58f95957f2ab47d1221dee32a61d6cdc4685b
2023-05-26 00:25:51 +00:00
ASSERT_OK(dbfull()->TEST_WaitForCompact());
}
CompactRangeOptions cro;
cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
uint64_t hot_data_size = GetSstSizeHelper(Temperature::kUnknown);
uint64_t cold_data_size = GetSstSizeHelper(Temperature::kCold);
ASSERT_GT(hot_data_size, 0);
ASSERT_GT(cold_data_size, 0);
// the first a few key should be cold
AssertKeyTemperature(20, Temperature::kCold);
for (int i = 0; i < 30; i++) {
dbfull()->TEST_WaitForPeriodicTaskRun([&] {
mock_clock_->MockSleepForSeconds(static_cast<int>(20 * kKeyPerSec));
});
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
// the hot/cold data cut off range should be between i * 20 + 200 -> 250
AssertKeyTemperature(i * 20 + 250, Temperature::kUnknown);
AssertKeyTemperature(i * 20 + 200, Temperature::kCold);
}
ASSERT_LT(GetSstSizeHelper(Temperature::kUnknown), hot_data_size);
ASSERT_GT(GetSstSizeHelper(Temperature::kCold), cold_data_size);
// Wait again, the most of the data should be cold after that
// but it may not be all cold, because if there's no new data write to SST,
// the compaction will not get the new seqno->time sampling to decide the last
// a few data's time.
for (int i = 0; i < 5; i++) {
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(1000)); });
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
}
// any random data close to the end should be cold
AssertKeyTemperature(1000, Temperature::kCold);
// close explicitly, because the env is local variable which will be released
// first.
Close();
}
TEST_F(SeqnoTimeTest, TemperatureBasicLevel) {
const int kNumLevels = 7;
const int kNumKeys = 100;
Options options = CurrentOptions();
options.preclude_last_level_data_seconds = 10000;
options.env = mock_env_.get();
options.bottommost_temperature = Temperature::kCold;
options.num_levels = kNumLevels;
options.level_compaction_dynamic_level_bytes = true;
// TODO(zjay): for level compaction, auto-compaction may stuck in deadloop, if
// the penultimate level score > 1, but the hot is not cold enough to compact
// to last level, which will keep triggering compaction.
options.disable_auto_compactions = true;
DestroyAndReopen(options);
int sst_num = 0;
// Write files that are overlap
for (; sst_num < 4; sst_num++) {
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
}
ASSERT_OK(Flush());
}
CompactRangeOptions cro;
cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
// All data is hot, only output to penultimate level
ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
// read a random key, which should be hot (kUnknown)
AssertKeyTemperature(20, Temperature::kUnknown);
// Adding more data to have mixed hot and cold data
for (; sst_num < 14; sst_num++) {
for (int i = 0; i < kNumKeys; i++) {
ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
}
ASSERT_OK(Flush());
}
`CompactRange()` always compacts to bottommost level for leveled compaction (#11468) Summary: currently for leveled compaction, the max output level of a call to `CompactRange()` is pre-computed before compacting each level. This max output level is the max level whose key range overlaps with the manual compaction key range. However, during manual compaction, files in the max output level may be compacted down further by some background compaction. When this background compaction is a trivial move, there is a race condition and the manual compaction may not be able to compact all keys in the specified key range. This PR updates `CompactRange()` to always compact to the bottommost level to make this race condition more unlikely (it can still happen, see more in comment here: https://github.com/cbi42/rocksdb/blob/796f58f42ad1bdbf49e5fcf480763f11583b790e/db/db_impl/db_impl_compaction_flush.cc#L1180C29-L1184). This PR also changes the behavior of CompactRange() when `bottommost_level_compaction=kIfHaveCompactionFilter` (the default option). The old behavior is that, if a compaction filter is provided, CompactRange() always does an intra-level compaction at the final output level for all files in the manual compaction key range. The only exception when `first_overlapped_level = 0` and `max_overlapped_level = 0`. It’s awkward to maintain the same behavior after this PR since we do not compute max_overlapped_level anymore. So the new behavior is similar to kForceOptimized: always does intra-level compaction at the bottommost level, but not including new files generated during this manual compaction. Several unit tests are updated to work with this new manual compaction behavior. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11468 Test Plan: Add new unit tests `DBCompactionTest.ManualCompactionCompactAllKeysInRange*` Reviewed By: ajkr Differential Revision: D46079619 Pulled By: cbi42 fbshipit-source-id: 19d844ba4ec8dc1a0b8af5d2f36ff15820c6e76f
2023-06-01 22:27:29 +00:00
// Second to last level
MoveFilesToLevel(5);
ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
// Compact the files to the last level which should split the hot/cold data
MoveFilesToLevel(6);
uint64_t hot_data_size = GetSstSizeHelper(Temperature::kUnknown);
uint64_t cold_data_size = GetSstSizeHelper(Temperature::kCold);
ASSERT_GT(hot_data_size, 0);
ASSERT_GT(cold_data_size, 0);
// the first a few key should be cold
AssertKeyTemperature(20, Temperature::kCold);
// Wait some time, with each wait, the cold data is increasing and hot data is
// decreasing
for (int i = 0; i < 30; i++) {
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(200)); });
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
uint64_t pre_hot = hot_data_size;
uint64_t pre_cold = cold_data_size;
hot_data_size = GetSstSizeHelper(Temperature::kUnknown);
cold_data_size = GetSstSizeHelper(Temperature::kCold);
ASSERT_LT(hot_data_size, pre_hot);
ASSERT_GT(cold_data_size, pre_cold);
// the hot/cold cut_off key should be around i * 20 + 400 -> 450
AssertKeyTemperature(i * 20 + 450, Temperature::kUnknown);
AssertKeyTemperature(i * 20 + 400, Temperature::kCold);
}
// Wait again, the most of the data should be cold after that
// hot data might not be empty, because if we don't write new data, there's
// no seqno->time sampling available to the compaction
for (int i = 0; i < 5; i++) {
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(1000)); });
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
}
// any random data close to the end should be cold
AssertKeyTemperature(1000, Temperature::kCold);
Close();
}
// Selects which time-tracking option(s) a parameterized test enables.
// Enumerators are numbered 0, 1, 2 in declaration order.
enum class SeqnoTimeTestType : char {
  // Only preserve_internal_time_seconds is set.
  kTrackInternalTimeSeconds,
  // Only preclude_last_level_data_seconds is set.
  kPrecludeLastLevel,
  // Both are set, with the preserve duration being the smaller one.
  kBothSetTrackSmaller,
};
// Parameterized variant of SeqnoTimeTest: each test runs once per
// SeqnoTimeTestType, which decides which option(s) turn time tracking on.
class SeqnoTimeTablePropTest
    : public SeqnoTimeTest,
      public ::testing::WithParamInterface<SeqnoTimeTestType> {
 public:
  SeqnoTimeTablePropTest() = default;

  // Writes `track_time_duration` into `options` according to the current test
  // parameter. Either option being set enables the time tracking feature.
  void SetTrackTimeDurationOptions(uint64_t track_time_duration,
                                   Options& options) const {
    const SeqnoTimeTestType mode = GetParam();
    if (mode == SeqnoTimeTestType::kTrackInternalTimeSeconds) {
      // Track via the preserve option only.
      options.preclude_last_level_data_seconds = 0;
      options.preserve_internal_time_seconds = track_time_duration;
    } else if (mode == SeqnoTimeTestType::kPrecludeLastLevel) {
      // Track via the preclude option only.
      options.preclude_last_level_data_seconds = track_time_duration;
      options.preserve_internal_time_seconds = 0;
    } else if (mode == SeqnoTimeTestType::kBothSetTrackSmaller) {
      // Both set; the preserve duration is a tenth of the preclude duration.
      options.preclude_last_level_data_seconds = track_time_duration;
      options.preserve_internal_time_seconds = track_time_duration / 10;
    }
  }
};
// Instantiate the SeqnoTimeTablePropTest suite with each of the three
// SeqnoTimeTestType values.
INSTANTIATE_TEST_CASE_P(
SeqnoTimeTablePropTest, SeqnoTimeTablePropTest,
::testing::Values(SeqnoTimeTestType::kTrackInternalTimeSeconds,
SeqnoTimeTestType::kPrecludeLastLevel,
SeqnoTimeTestType::kBothSetTrackSmaller));
TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) {
Options options = CurrentOptions();
SetTrackTimeDurationOptions(10000, options);
options.env = mock_env_.get();
options.disable_auto_compactions = true;
DestroyAndReopen(options);
std::set<uint64_t> checked_file_nums;
Refactor, clean up, fixes, and more testing for SeqnoToTimeMapping (#11905) Summary: This change is before a planned DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping (bug fix with existing test work-arounds). **Intended follow-up** However, I found enough issues with SeqnoToTimeMapping to warrant this PR first, including very small fixes in DB implementation related to API contract of SeqnoToTimeMapping. Functional fixes / changes: * This fixes some mishandling of boundary cases. For example, if the user decides to stop writing to DB, the last written sequence number would perpetually have its write time updated to "now" and would always be ineligible for migration to cold tier. Part of the problem is that the SeqnoToTimeMapping would return a seqno known to have been written before (immediately or otherwise) the requested time, but compaction_job.cc would include that seqno in the preserve/exclude set. That is fixed (in part) by adding one in compaction_job.cc * That problem was worse because a whole range of seqnos could be updated perpetually with new times in SeqnoToTimeMapping::Append (if no writes to DB). That logic was apparently optimized for GetOldestApproximateTime (now GetProximalTimeBeforeSeqno), which is not used in production, to the detriment of GetOldestSequenceNum (now GetProximalSeqnoBeforeTime), which is used in production. (Perhaps plans changed during development?) This is fixed in Append to optimize for accuracy of GetProximalSeqnoBeforeTime. (Unit tests added and updated.) * Related: SeqnoToTimeMapping did not have a clear contract about the relationships between seqnos and times, just the idea of a rough correspondence. Now the class description makes it clear that the write time of each recorded seqno comes before or at the associated time, to support getting best results for GetProximalSeqnoBeforeTime. And this makes it easier to make clear the contract of each API function. 
* Update `DBImpl::RecordSeqnoToTimeMapping()` to follow this ordering in gathering samples. Some part of these changes has required an expanded test work-around for the problem (see intended follow-up above) that the DB does not immediately ensure recent seqnos are covered by its mapping. These work-arounds will be removed with that planned work. An apparent compaction bug is revealed in PrecludeLastLevelTest::RangeDelsCauseFileEndpointsToOverlap, so that test is disabled. Filed GitHub issue #11909 Cosmetic / code safety things (not exhaustive): * Fix some confusing names. * `seqno_time_mapping` was used inconsistently in places. Now just `seqno_to_time_mapping` to correspond to class name. * Rename confusing `GetOldestSequenceNum` -> `GetProximalSeqnoBeforeTime` and `GetOldestApproximateTime` -> `GetProximalTimeBeforeSeqno`. Part of the motivation is that our times and seqnos here have the same underlying type, so we want to be clear about which is expected where to avoid mixing. * Rename `kUnknownSeqnoTime` to `kUnknownTimeBeforeAll` because the value is a bad choice for unknown if we ever add ProximalAfterBlah functions. * Arithmetic on SeqnoTimePair doesn't make sense except for delta encoding, so use better names / APIs with that in mind. * (OMG) Don't allow direct comparison between SeqnoTimePair and SequenceNumber. (There is no checking that it isn't compared against time by accident.) * A field name essentially matching the containing class name is a confusing pattern (`seqno_time_mapping_`). * Wrap calls to confusing (but useful) upper_bound and lower_bound functions to have clearer names and more code reuse. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11905 Test Plan: GetOldestSequenceNum (now GetProximalSeqnoBeforeTime) and TruncateOldEntries were lacking unit tests, despite both being used in production (experimental feature). Added those and expanded others. 
Reviewed By: jowlyzhang Differential Revision: D49755592 Pulled By: pdillinger fbshipit-source-id: f72a3baac74d24b963c77e538bba89a7fc8dce51
2023-09-29 18:21:59 +00:00
SequenceNumber start_seq = dbfull()->GetLatestSequenceNumber() + 1;
Update tiered storage tests (ahead of next change) (#11917) Summary: After https://github.com/facebook/rocksdb/issues/11905, I am preparing a DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping. **Intended follow-up** However, there are a number of test changes I want to make prior to that to make it clear that I am not regressing the tests and production behavior at the same time. * Start mock time in the tests well beyond epoch (time 0) so that we aren't normally reaching into pre-history for current time minus the preserve/preclude duration. * Majorly clean up BasicSeqnoToTimeMapping to avoid confusing hard-coded bounds on GetProximalTimeBeforeSeqno() results. * There is an unresolved/unexplained issue marked with FIXME that should be investigated when GetProximalTimeBeforeSeqno() is put into production. * MultiCFs test was strangely generating 5 L0 files, four of which would be compacted into an L1, and then letting TTL compaction compact 1@L0+1@L1. Changing the starting time of the tests seemed to mess up the TTL compaction. But I suspect the TTL compaction was unintentional, so I've cut it down to just 4 L0 files, which compacts predictably. * Unrelated: allow ROCKSDB_NO_STACK=1 to skip printing a stack trace on assertion failures. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11917 Test Plan: no changes to production code Reviewed By: jowlyzhang Differential Revision: D49841436 Pulled By: pdillinger fbshipit-source-id: 753348ace9c548e82bcb77fcc8b2ffb7a6beeb0a
2023-10-02 23:19:05 +00:00
uint64_t start_time = mock_clock_->NowSeconds();
// Write a key every 10 seconds
for (int i = 0; i < 200; i++) {
ASSERT_OK(Put(Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
}
ASSERT_OK(Flush());
TablePropertiesCollection tables_props;
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
ASSERT_EQ(tables_props.size(), 1);
auto it = tables_props.begin();
SeqnoToTimeMapping tp_mapping;
ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
ASSERT_OK(tp_mapping.Sort());
ASSERT_FALSE(tp_mapping.Empty());
auto seqs = tp_mapping.TEST_GetInternalMapping();
// about ~20 seqs->time entries, because the sample rate is 10000/100, and it
Bootstrap, pre-populate seqno_to_time_mapping (#11922) Summary: This change has two primary goals (follow-up to https://github.com/facebook/rocksdb/issues/11917, https://github.com/facebook/rocksdb/issues/11920): * Ensure the DB seqno_to_time_mapping has entries that allow us to put a good time lower bound on any writes that happen after setting up preserve/preclude options (either in a new DB, new CF, SetOptions, etc.) and haven't yet aged out of that time window. This allows us to remove a bunch of work-arounds in tests. * For new DBs using preserve/preclude options, automatically reserve some sequence numbers and pre-map them to cover the time span back to the preserve/preclude cut-off time. In the future, this will allow us to import data from another DB by key, value, and write time by assigning an appropriate seqno in this DB for that write time. Note that the pre-population (historical mappings) does not happen if the original options at DB Open time do not have preserve/preclude, so it is recommended to create initial column families at that time with create_missing_column_families, to take advantage of this (future) feature. (Adding these historical mappings after DB Open would risk non-monotonic seqno_to_time_mapping, which is dubious if not dangerous.) Recommended follow-up: * Solve existing race conditions (not memory safety) where parallel operations like CreateColumnFamily or SetDBOptions could leave the wrong setting in effect. * Make SeqnoToTimeMapping more gracefully handle a possible case in which too many mappings are added for the time range of concern. It seems like there could be cases where data is massively excluded from the cold tier because of entries falling off the front of the mapping list (causing GetProximalSeqnoBeforeTime() to return 0). (More investigation needed.) No release note for the minor bug fix because this is still an experimental feature with limited usage. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11922 Test Plan: tests added / updated Reviewed By: jowlyzhang Differential Revision: D49956563 Pulled By: pdillinger fbshipit-source-id: 92beb918c3a298fae9ca8e509717b1067caa1519
2023-10-06 15:21:21 +00:00
// passes 2k seconds. Add (roughly) one for the starting entry.
ASSERT_GE(seqs.size(), 20);
ASSERT_LE(seqs.size(), 22);
Refactor, clean up, fixes, and more testing for SeqnoToTimeMapping (#11905) Summary: This change is before a planned DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping (bug fix with existing test work-arounds). **Intended follow-up** However, I found enough issues with SeqnoToTimeMapping to warrant this PR first, including very small fixes in DB implementation related to API contract of SeqnoToTimeMapping. Functional fixes / changes: * This fixes some mishandling of boundary cases. For example, if the user decides to stop writing to DB, the last written sequence number would perpetually have its write time updated to "now" and would always be ineligible for migration to cold tier. Part of the problem is that the SeqnoToTimeMapping would return a seqno known to have been written before (immediately or otherwise) the requested time, but compaction_job.cc would include that seqno in the preserve/exclude set. That is fixed (in part) by adding one in compaction_job.cc * That problem was worse because a whole range of seqnos could be updated perpetually with new times in SeqnoToTimeMapping::Append (if no writes to DB). That logic was apparently optimized for GetOldestApproximateTime (now GetProximalTimeBeforeSeqno), which is not used in production, to the detriment of GetOldestSequenceNum (now GetProximalSeqnoBeforeTime), which is used in production. (Perhaps plans changed during development?) This is fixed in Append to optimize for accuracy of GetProximalSeqnoBeforeTime. (Unit tests added and updated.) * Related: SeqnoToTimeMapping did not have a clear contract about the relationships between seqnos and times, just the idea of a rough correspondence. Now the class description makes it clear that the write time of each recorded seqno comes before or at the associated time, to support getting best results for GetProximalSeqnoBeforeTime. And this makes it easier to make clear the contract of each API function. 
* Update `DBImpl::RecordSeqnoToTimeMapping()` to follow this ordering in gathering samples. Some part of these changes has required an expanded test work-around for the problem (see intended follow-up above) that the DB does not immediately ensure recent seqnos are covered by its mapping. These work-arounds will be removed with that planned work. An apparent compaction bug is revealed in PrecludeLastLevelTest::RangeDelsCauseFileEndpointsToOverlap, so that test is disabled. Filed GitHub issue #11909 Cosmetic / code safety things (not exhaustive): * Fix some confusing names. * `seqno_time_mapping` was used inconsistently in places. Now just `seqno_to_time_mapping` to correspond to class name. * Rename confusing `GetOldestSequenceNum` -> `GetProximalSeqnoBeforeTime` and `GetOldestApproximateTime` -> `GetProximalTimeBeforeSeqno`. Part of the motivation is that our times and seqnos here have the same underlying type, so we want to be clear about which is expected where to avoid mixing. * Rename `kUnknownSeqnoTime` to `kUnknownTimeBeforeAll` because the value is a bad choice for unknown if we ever add ProximalAfterBlah functions. * Arithmetic on SeqnoTimePair doesn't make sense except for delta encoding, so use better names / APIs with that in mind. * (OMG) Don't allow direct comparison between SeqnoTimePair and SequenceNumber. (There is no checking that it isn't compared against time by accident.) * A field name essentially matching the containing class name is a confusing pattern (`seqno_time_mapping_`). * Wrap calls to confusing (but useful) upper_bound and lower_bound functions to have clearer names and more code reuse. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11905 Test Plan: GetOldestSequenceNum (now GetProximalSeqnoBeforeTime) and TruncateOldEntries were lacking unit tests, despite both being used in production (experimental feature). Added those and expanded others. 
Reviewed By: jowlyzhang Differential Revision: D49755592 Pulled By: pdillinger fbshipit-source-id: f72a3baac74d24b963c77e538bba89a7fc8dce51
2023-09-29 18:21:59 +00:00
SequenceNumber seq_end = dbfull()->GetLatestSequenceNumber() + 1;
for (auto i = start_seq; i < seq_end; i++) {
// The result is within the range
Update tiered storage tests (ahead of next change) (#11917) Summary: After https://github.com/facebook/rocksdb/issues/11905, I am preparing a DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping. **Intended follow-up** However, there are a number of test changes I want to make prior to that to make it clear that I am not regressing the tests and production behavior at the same time. * Start mock time in the tests well beyond epoch (time 0) so that we aren't normally reaching into pre-history for current time minus the preserve/preclude duration. * Majorly clean up BasicSeqnoToTimeMapping to avoid confusing hard-coded bounds on GetProximalTimeBeforeSeqno() results. * There is an unresolved/unexplained issue marked with FIXME that should be investigated when GetProximalTimeBeforeSeqno() is put into production. * MultiCFs test was strangely generating 5 L0 files, four of which would be compacted into an L1, and then letting TTL compaction compact 1@L0+1@L1. Changing the starting time of the tests seemed to mess up the TTL compaction. But I suspect the TTL compaction was unintentional, so I've cut it down to just 4 L0 files, which compacts predictably. * Unrelated: allow ROCKSDB_NO_STACK=1 to skip printing a stack trace on assertion failures. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11917 Test Plan: no changes to production code Reviewed By: jowlyzhang Differential Revision: D49841436 Pulled By: pdillinger fbshipit-source-id: 753348ace9c548e82bcb77fcc8b2ffb7a6beeb0a
2023-10-02 23:19:05 +00:00
ASSERT_GE(tp_mapping.GetProximalTimeBeforeSeqno(i),
start_time + (i - start_seq) * 10 - 100);
ASSERT_LE(tp_mapping.GetProximalTimeBeforeSeqno(i),
start_time + (i - start_seq) * 10);
}
checked_file_nums.insert(it->second->orig_file_number);
start_seq = seq_end;
Update tiered storage tests (ahead of next change) (#11917) Summary: After https://github.com/facebook/rocksdb/issues/11905, I am preparing a DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping. **Intended follow-up** However, there are a number of test changes I want to make prior to that to make it clear that I am not regressing the tests and production behavior at the same time. * Start mock time in the tests well beyond epoch (time 0) so that we aren't normally reaching into pre-history for current time minus the preserve/preclude duration. * Majorly clean up BasicSeqnoToTimeMapping to avoid confusing hard-coded bounds on GetProximalTimeBeforeSeqno() results. * There is an unresolved/unexplained issue marked with FIXME that should be investigated when GetProximalTimeBeforeSeqno() is put into production. * MultiCFs test was strangely generating 5 L0 files, four of which would be compacted into an L1, and then letting TTL compaction compact 1@L0+1@L1. Changing the starting time of the tests seemed to mess up the TTL compaction. But I suspect the TTL compaction was unintentional, so I've cut it down to just 4 L0 files, which compacts predictably. * Unrelated: allow ROCKSDB_NO_STACK=1 to skip printing a stack trace on assertion failures. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11917 Test Plan: no changes to production code Reviewed By: jowlyzhang Differential Revision: D49841436 Pulled By: pdillinger fbshipit-source-id: 753348ace9c548e82bcb77fcc8b2ffb7a6beeb0a
2023-10-02 23:19:05 +00:00
start_time = mock_clock_->NowSeconds();
// Write a key every 1 second
for (int i = 0; i < 200; i++) {
ASSERT_OK(Put(Key(i + 190), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(1)); });
}
Refactor, clean up, fixes, and more testing for SeqnoToTimeMapping (#11905) Summary: This change is before a planned DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping (bug fix with existing test work-arounds). **Intended follow-up** However, I found enough issues with SeqnoToTimeMapping to warrant this PR first, including very small fixes in DB implementation related to API contract of SeqnoToTimeMapping. Functional fixes / changes: * This fixes some mishandling of boundary cases. For example, if the user decides to stop writing to DB, the last written sequence number would perpetually have its write time updated to "now" and would always be ineligible for migration to cold tier. Part of the problem is that the SeqnoToTimeMapping would return a seqno known to have been written before (immediately or otherwise) the requested time, but compaction_job.cc would include that seqno in the preserve/exclude set. That is fixed (in part) by adding one in compaction_job.cc * That problem was worse because a whole range of seqnos could be updated perpetually with new times in SeqnoToTimeMapping::Append (if no writes to DB). That logic was apparently optimized for GetOldestApproximateTime (now GetProximalTimeBeforeSeqno), which is not used in production, to the detriment of GetOldestSequenceNum (now GetProximalSeqnoBeforeTime), which is used in production. (Perhaps plans changed during development?) This is fixed in Append to optimize for accuracy of GetProximalSeqnoBeforeTime. (Unit tests added and updated.) * Related: SeqnoToTimeMapping did not have a clear contract about the relationships between seqnos and times, just the idea of a rough correspondence. Now the class description makes it clear that the write time of each recorded seqno comes before or at the associated time, to support getting best results for GetProximalSeqnoBeforeTime. And this makes it easier to make clear the contract of each API function. 
* Update `DBImpl::RecordSeqnoToTimeMapping()` to follow this ordering in gathering samples. Some part of these changes has required an expanded test work-around for the problem (see intended follow-up above) that the DB does not immediately ensure recent seqnos are covered by its mapping. These work-arounds will be removed with that planned work. An apparent compaction bug is revealed in PrecludeLastLevelTest::RangeDelsCauseFileEndpointsToOverlap, so that test is disabled. Filed GitHub issue #11909 Cosmetic / code safety things (not exhaustive): * Fix some confusing names. * `seqno_time_mapping` was used inconsistently in places. Now just `seqno_to_time_mapping` to correspond to class name. * Rename confusing `GetOldestSequenceNum` -> `GetProximalSeqnoBeforeTime` and `GetOldestApproximateTime` -> `GetProximalTimeBeforeSeqno`. Part of the motivation is that our times and seqnos here have the same underlying type, so we want to be clear about which is expected where to avoid mixing. * Rename `kUnknownSeqnoTime` to `kUnknownTimeBeforeAll` because the value is a bad choice for unknown if we ever add ProximalAfterBlah functions. * Arithmetic on SeqnoTimePair doesn't make sense except for delta encoding, so use better names / APIs with that in mind. * (OMG) Don't allow direct comparison between SeqnoTimePair and SequenceNumber. (There is no checking that it isn't compared against time by accident.) * A field name essentially matching the containing class name is a confusing pattern (`seqno_time_mapping_`). * Wrap calls to confusing (but useful) upper_bound and lower_bound functions to have clearer names and more code reuse. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11905 Test Plan: GetOldestSequenceNum (now GetProximalSeqnoBeforeTime) and TruncateOldEntries were lacking unit tests, despite both being used in production (experimental feature). Added those and expanded others. 
Reviewed By: jowlyzhang Differential Revision: D49755592 Pulled By: pdillinger fbshipit-source-id: f72a3baac74d24b963c77e538bba89a7fc8dce51
2023-09-29 18:21:59 +00:00
seq_end = dbfull()->GetLatestSequenceNumber() + 1;
ASSERT_OK(Flush());
tables_props.clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
ASSERT_EQ(tables_props.size(), 2);
it = tables_props.begin();
while (it != tables_props.end()) {
if (!checked_file_nums.count(it->second->orig_file_number)) {
break;
}
it++;
}
ASSERT_TRUE(it != tables_props.end());
tp_mapping.Clear();
ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
ASSERT_OK(tp_mapping.Sort());
seqs = tp_mapping.TEST_GetInternalMapping();
// There are only a few time samples
ASSERT_GE(seqs.size(), 1);
ASSERT_LE(seqs.size(), 3);
for (auto i = start_seq; i < seq_end; i++) {
Update tiered storage tests (ahead of next change) (#11917) Summary: After https://github.com/facebook/rocksdb/issues/11905, I am preparing a DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping. **Intended follow-up** However, there are a number of test changes I want to make prior to that to make it clear that I am not regressing the tests and production behavior at the same time. * Start mock time in the tests well beyond epoch (time 0) so that we aren't normally reaching into pre-history for current time minus the preserve/preclude duration. * Majorly clean up BasicSeqnoToTimeMapping to avoid confusing hard-coded bounds on GetProximalTimeBeforeSeqno() results. * There is an unresolved/unexplained issue marked with FIXME that should be investigated when GetProximalTimeBeforeSeqno() is put into production. * MultiCFs test was strangely generating 5 L0 files, four of which would be compacted into an L1, and then letting TTL compaction compact 1@L0+1@L1. Changing the starting time of the tests seemed to mess up the TTL compaction. But I suspect the TTL compaction was unintentional, so I've cut it down to just 4 L0 files, which compacts predictably. * Unrelated: allow ROCKSDB_NO_STACK=1 to skip printing a stack trace on assertion failures. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11917 Test Plan: no changes to production code Reviewed By: jowlyzhang Differential Revision: D49841436 Pulled By: pdillinger fbshipit-source-id: 753348ace9c548e82bcb77fcc8b2ffb7a6beeb0a
2023-10-02 23:19:05 +00:00
ASSERT_GE(tp_mapping.GetProximalTimeBeforeSeqno(i),
start_time + (i - start_seq) - 100);
ASSERT_LE(tp_mapping.GetProximalTimeBeforeSeqno(i),
start_time + (i - start_seq));
}
checked_file_nums.insert(it->second->orig_file_number);
start_seq = seq_end;
Update tiered storage tests (ahead of next change) (#11917) Summary: After https://github.com/facebook/rocksdb/issues/11905, I am preparing a DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping. **Intended follow-up** However, there are a number of test changes I want to make prior to that to make it clear that I am not regressing the tests and production behavior at the same time. * Start mock time in the tests well beyond epoch (time 0) so that we aren't normally reaching into pre-history for current time minus the preserve/preclude duration. * Majorly clean up BasicSeqnoToTimeMapping to avoid confusing hard-coded bounds on GetProximalTimeBeforeSeqno() results. * There is an unresolved/unexplained issue marked with FIXME that should be investigated when GetProximalTimeBeforeSeqno() is put into production. * MultiCFs test was strangely generating 5 L0 files, four of which would be compacted into an L1, and then letting TTL compaction compact 1@L0+1@L1. Changing the starting time of the tests seemed to mess up the TTL compaction. But I suspect the TTL compaction was unintentional, so I've cut it down to just 4 L0 files, which compacts predictably. * Unrelated: allow ROCKSDB_NO_STACK=1 to skip printing a stack trace on assertion failures. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11917 Test Plan: no changes to production code Reviewed By: jowlyzhang Differential Revision: D49841436 Pulled By: pdillinger fbshipit-source-id: 753348ace9c548e82bcb77fcc8b2ffb7a6beeb0a
2023-10-02 23:19:05 +00:00
start_time = mock_clock_->NowSeconds();
// Write a key every 200 seconds
for (int i = 0; i < 200; i++) {
ASSERT_OK(Put(Key(i + 380), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(200)); });
}
Refactor, clean up, fixes, and more testing for SeqnoToTimeMapping (#11905) Summary: This change is before a planned DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping (bug fix with existing test work-arounds). **Intended follow-up** However, I found enough issues with SeqnoToTimeMapping to warrant this PR first, including very small fixes in DB implementation related to API contract of SeqnoToTimeMapping. Functional fixes / changes: * This fixes some mishandling of boundary cases. For example, if the user decides to stop writing to DB, the last written sequence number would perpetually have its write time updated to "now" and would always be ineligible for migration to cold tier. Part of the problem is that the SeqnoToTimeMapping would return a seqno known to have been written before (immediately or otherwise) the requested time, but compaction_job.cc would include that seqno in the preserve/exclude set. That is fixed (in part) by adding one in compaction_job.cc * That problem was worse because a whole range of seqnos could be updated perpetually with new times in SeqnoToTimeMapping::Append (if no writes to DB). That logic was apparently optimized for GetOldestApproximateTime (now GetProximalTimeBeforeSeqno), which is not used in production, to the detriment of GetOldestSequenceNum (now GetProximalSeqnoBeforeTime), which is used in production. (Perhaps plans changed during development?) This is fixed in Append to optimize for accuracy of GetProximalSeqnoBeforeTime. (Unit tests added and updated.) * Related: SeqnoToTimeMapping did not have a clear contract about the relationships between seqnos and times, just the idea of a rough correspondence. Now the class description makes it clear that the write time of each recorded seqno comes before or at the associated time, to support getting best results for GetProximalSeqnoBeforeTime. And this makes it easier to make clear the contract of each API function. 
* Update `DBImpl::RecordSeqnoToTimeMapping()` to follow this ordering in gathering samples. Some part of these changes has required an expanded test work-around for the problem (see intended follow-up above) that the DB does not immediately ensure recent seqnos are covered by its mapping. These work-arounds will be removed with that planned work. An apparent compaction bug is revealed in PrecludeLastLevelTest::RangeDelsCauseFileEndpointsToOverlap, so that test is disabled. Filed GitHub issue #11909 Cosmetic / code safety things (not exhaustive): * Fix some confusing names. * `seqno_time_mapping` was used inconsistently in places. Now just `seqno_to_time_mapping` to correspond to class name. * Rename confusing `GetOldestSequenceNum` -> `GetProximalSeqnoBeforeTime` and `GetOldestApproximateTime` -> `GetProximalTimeBeforeSeqno`. Part of the motivation is that our times and seqnos here have the same underlying type, so we want to be clear about which is expected where to avoid mixing. * Rename `kUnknownSeqnoTime` to `kUnknownTimeBeforeAll` because the value is a bad choice for unknown if we ever add ProximalAfterBlah functions. * Arithmetic on SeqnoTimePair doesn't make sense except for delta encoding, so use better names / APIs with that in mind. * (OMG) Don't allow direct comparison between SeqnoTimePair and SequenceNumber. (There is no checking that it isn't compared against time by accident.) * A field name essentially matching the containing class name is a confusing pattern (`seqno_time_mapping_`). * Wrap calls to confusing (but useful) upper_bound and lower_bound functions to have clearer names and more code reuse. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11905 Test Plan: GetOldestSequenceNum (now GetProximalSeqnoBeforeTime) and TruncateOldEntries were lacking unit tests, despite both being used in production (experimental feature). Added those and expanded others. 
Reviewed By: jowlyzhang Differential Revision: D49755592 Pulled By: pdillinger fbshipit-source-id: f72a3baac74d24b963c77e538bba89a7fc8dce51
2023-09-29 18:21:59 +00:00
seq_end = dbfull()->GetLatestSequenceNumber() + 1;
ASSERT_OK(Flush());
tables_props.clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
ASSERT_EQ(tables_props.size(), 3);
it = tables_props.begin();
while (it != tables_props.end()) {
if (!checked_file_nums.count(it->second->orig_file_number)) {
break;
}
it++;
}
ASSERT_TRUE(it != tables_props.end());
tp_mapping.Clear();
ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
ASSERT_OK(tp_mapping.Sort());
seqs = tp_mapping.TEST_GetInternalMapping();
// The sequence number -> time entries should be maxed
ASSERT_GE(seqs.size(), 99);
ASSERT_LE(seqs.size(), 101);
for (auto i = start_seq; i < seq_end; i++) {
Update tiered storage tests (ahead of next change) (#11917) Summary: After https://github.com/facebook/rocksdb/issues/11905, I am preparing a DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping. **Intended follow-up** However, there are a number of test changes I want to make prior to that to make it clear that I am not regressing the tests and production behavior at the same time. * Start mock time in the tests well beyond epoch (time 0) so that we aren't normally reaching into pre-history for current time minus the preserve/preclude duration. * Majorly clean up BasicSeqnoToTimeMapping to avoid confusing hard-coded bounds on GetProximalTimeBeforeSeqno() results. * There is an unresolved/unexplained issue marked with FIXME that should be investigated when GetProximalTimeBeforeSeqno() is put into production. * MultiCFs test was strangely generating 5 L0 files, four of which would be compacted into an L1, and then letting TTL compaction compact 1@L0+1@L1. Changing the starting time of the tests seemed to mess up the TTL compaction. But I suspect the TTL compaction was unintentional, so I've cut it down to just 4 L0 files, which compacts predictably. * Unrelated: allow ROCKSDB_NO_STACK=1 to skip printing a stack trace on assertion failures. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11917 Test Plan: no changes to production code Reviewed By: jowlyzhang Differential Revision: D49841436 Pulled By: pdillinger fbshipit-source-id: 753348ace9c548e82bcb77fcc8b2ffb7a6beeb0a
2023-10-02 23:19:05 +00:00
// aged out entries allowed to report time=0
if ((seq_end - i) * 200 <= 10000) {
ASSERT_GE(tp_mapping.GetProximalTimeBeforeSeqno(i),
start_time + (i - start_seq) * 200 - 100);
}
Refactor, clean up, fixes, and more testing for SeqnoToTimeMapping (#11905) Summary: This change is before a planned DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping (bug fix with existing test work-arounds). **Intended follow-up** However, I found enough issues with SeqnoToTimeMapping to warrant this PR first, including very small fixes in DB implementation related to API contract of SeqnoToTimeMapping. Functional fixes / changes: * This fixes some mishandling of boundary cases. For example, if the user decides to stop writing to DB, the last written sequence number would perpetually have its write time updated to "now" and would always be ineligible for migration to cold tier. Part of the problem is that the SeqnoToTimeMapping would return a seqno known to have been written before (immediately or otherwise) the requested time, but compaction_job.cc would include that seqno in the preserve/exclude set. That is fixed (in part) by adding one in compaction_job.cc * That problem was worse because a whole range of seqnos could be updated perpetually with new times in SeqnoToTimeMapping::Append (if no writes to DB). That logic was apparently optimized for GetOldestApproximateTime (now GetProximalTimeBeforeSeqno), which is not used in production, to the detriment of GetOldestSequenceNum (now GetProximalSeqnoBeforeTime), which is used in production. (Perhaps plans changed during development?) This is fixed in Append to optimize for accuracy of GetProximalSeqnoBeforeTime. (Unit tests added and updated.) * Related: SeqnoToTimeMapping did not have a clear contract about the relationships between seqnos and times, just the idea of a rough correspondence. Now the class description makes it clear that the write time of each recorded seqno comes before or at the associated time, to support getting best results for GetProximalSeqnoBeforeTime. And this makes it easier to make clear the contract of each API function. 
* Update `DBImpl::RecordSeqnoToTimeMapping()` to follow this ordering in gathering samples. Some part of these changes has required an expanded test work-around for the problem (see intended follow-up above) that the DB does not immediately ensure recent seqnos are covered by its mapping. These work-arounds will be removed with that planned work. An apparent compaction bug is revealed in PrecludeLastLevelTest::RangeDelsCauseFileEndpointsToOverlap, so that test is disabled. Filed GitHub issue #11909 Cosmetic / code safety things (not exhaustive): * Fix some confusing names. * `seqno_time_mapping` was used inconsistently in places. Now just `seqno_to_time_mapping` to correspond to class name. * Rename confusing `GetOldestSequenceNum` -> `GetProximalSeqnoBeforeTime` and `GetOldestApproximateTime` -> `GetProximalTimeBeforeSeqno`. Part of the motivation is that our times and seqnos here have the same underlying type, so we want to be clear about which is expected where to avoid mixing. * Rename `kUnknownSeqnoTime` to `kUnknownTimeBeforeAll` because the value is a bad choice for unknown if we ever add ProximalAfterBlah functions. * Arithmetic on SeqnoTimePair doesn't make sense except for delta encoding, so use better names / APIs with that in mind. * (OMG) Don't allow direct comparison between SeqnoTimePair and SequenceNumber. (There is no checking that it isn't compared against time by accident.) * A field name essentially matching the containing class name is a confusing pattern (`seqno_time_mapping_`). * Wrap calls to confusing (but useful) upper_bound and lower_bound functions to have clearer names and more code reuse. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11905 Test Plan: GetOldestSequenceNum (now GetProximalSeqnoBeforeTime) and TruncateOldEntries were lacking unit tests, despite both being used in production (experimental feature). Added those and expanded others. 
Reviewed By: jowlyzhang Differential Revision: D49755592 Pulled By: pdillinger fbshipit-source-id: f72a3baac74d24b963c77e538bba89a7fc8dce51
2023-09-29 18:21:59 +00:00
ASSERT_LE(tp_mapping.GetProximalTimeBeforeSeqno(i),
Update tiered storage tests (ahead of next change) (#11917) Summary: After https://github.com/facebook/rocksdb/issues/11905, I am preparing a DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping. **Intended follow-up** However, there are a number of test changes I want to make prior to that to make it clear that I am not regressing the tests and production behavior at the same time. * Start mock time in the tests well beyond epoch (time 0) so that we aren't normally reaching into pre-history for current time minus the preserve/preclude duration. * Majorly clean up BasicSeqnoToTimeMapping to avoid confusing hard-coded bounds on GetProximalTimeBeforeSeqno() results. * There is an unresolved/unexplained issue marked with FIXME that should be investigated when GetProximalTimeBeforeSeqno() is put into production. * MultiCFs test was strangely generating 5 L0 files, four of which would be compacted into an L1, and then letting TTL compaction compact 1@L0+1@L1. Changing the starting time of the tests seemed to mess up the TTL compaction. But I suspect the TTL compaction was unintentional, so I've cut it down to just 4 L0 files, which compacts predictably. * Unrelated: allow ROCKSDB_NO_STACK=1 to skip printing a stack trace on assertion failures. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11917 Test Plan: no changes to production code Reviewed By: jowlyzhang Differential Revision: D49841436 Pulled By: pdillinger fbshipit-source-id: 753348ace9c548e82bcb77fcc8b2ffb7a6beeb0a
2023-10-02 23:19:05 +00:00
start_time + (i - start_seq) * 200);
}
checked_file_nums.insert(it->second->orig_file_number);
start_seq = seq_end;
Update tiered storage tests (ahead of next change) (#11917) Summary: After https://github.com/facebook/rocksdb/issues/11905, I am preparing a DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping. **Intended follow-up** However, there are a number of test changes I want to make prior to that to make it clear that I am not regressing the tests and production behavior at the same time. * Start mock time in the tests well beyond epoch (time 0) so that we aren't normally reaching into pre-history for current time minus the preserve/preclude duration. * Majorly clean up BasicSeqnoToTimeMapping to avoid confusing hard-coded bounds on GetProximalTimeBeforeSeqno() results. * There is an unresolved/unexplained issue marked with FIXME that should be investigated when GetProximalTimeBeforeSeqno() is put into production. * MultiCFs test was strangely generating 5 L0 files, four of which would be compacted into an L1, and then letting TTL compaction compact 1@L0+1@L1. Changing the starting time of the tests seemed to mess up the TTL compaction. But I suspect the TTL compaction was unintentional, so I've cut it down to just 4 L0 files, which compacts predictably. * Unrelated: allow ROCKSDB_NO_STACK=1 to skip printing a stack trace on assertion failures. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11917 Test Plan: no changes to production code Reviewed By: jowlyzhang Differential Revision: D49841436 Pulled By: pdillinger fbshipit-source-id: 753348ace9c548e82bcb77fcc8b2ffb7a6beeb0a
2023-10-02 23:19:05 +00:00
start_time = mock_clock_->NowSeconds();
// Write a key every 100 seconds
for (int i = 0; i < 200; i++) {
ASSERT_OK(Put(Key(i + 570), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
}
Refactor, clean up, fixes, and more testing for SeqnoToTimeMapping (#11905) Summary: This change is before a planned DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping (bug fix with existing test work-arounds). **Intended follow-up** However, I found enough issues with SeqnoToTimeMapping to warrant this PR first, including very small fixes in DB implementation related to API contract of SeqnoToTimeMapping. Functional fixes / changes: * This fixes some mishandling of boundary cases. For example, if the user decides to stop writing to DB, the last written sequence number would perpetually have its write time updated to "now" and would always be ineligible for migration to cold tier. Part of the problem is that the SeqnoToTimeMapping would return a seqno known to have been written before (immediately or otherwise) the requested time, but compaction_job.cc would include that seqno in the preserve/exclude set. That is fixed (in part) by adding one in compaction_job.cc * That problem was worse because a whole range of seqnos could be updated perpetually with new times in SeqnoToTimeMapping::Append (if no writes to DB). That logic was apparently optimized for GetOldestApproximateTime (now GetProximalTimeBeforeSeqno), which is not used in production, to the detriment of GetOldestSequenceNum (now GetProximalSeqnoBeforeTime), which is used in production. (Perhaps plans changed during development?) This is fixed in Append to optimize for accuracy of GetProximalSeqnoBeforeTime. (Unit tests added and updated.) * Related: SeqnoToTimeMapping did not have a clear contract about the relationships between seqnos and times, just the idea of a rough correspondence. Now the class description makes it clear that the write time of each recorded seqno comes before or at the associated time, to support getting best results for GetProximalSeqnoBeforeTime. And this makes it easier to make clear the contract of each API function. 
* Update `DBImpl::RecordSeqnoToTimeMapping()` to follow this ordering in gathering samples. Some part of these changes has required an expanded test work-around for the problem (see intended follow-up above) that the DB does not immediately ensure recent seqnos are covered by its mapping. These work-arounds will be removed with that planned work. An apparent compaction bug is revealed in PrecludeLastLevelTest::RangeDelsCauseFileEndpointsToOverlap, so that test is disabled. Filed GitHub issue #11909 Cosmetic / code safety things (not exhaustive): * Fix some confusing names. * `seqno_time_mapping` was used inconsistently in places. Now just `seqno_to_time_mapping` to correspond to class name. * Rename confusing `GetOldestSequenceNum` -> `GetProximalSeqnoBeforeTime` and `GetOldestApproximateTime` -> `GetProximalTimeBeforeSeqno`. Part of the motivation is that our times and seqnos here have the same underlying type, so we want to be clear about which is expected where to avoid mixing. * Rename `kUnknownSeqnoTime` to `kUnknownTimeBeforeAll` because the value is a bad choice for unknown if we ever add ProximalAfterBlah functions. * Arithmetic on SeqnoTimePair doesn't make sense except for delta encoding, so use better names / APIs with that in mind. * (OMG) Don't allow direct comparison between SeqnoTimePair and SequenceNumber. (There is no checking that it isn't compared against time by accident.) * A field name essentially matching the containing class name is a confusing pattern (`seqno_time_mapping_`). * Wrap calls to confusing (but useful) upper_bound and lower_bound functions to have clearer names and more code reuse. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11905 Test Plan: GetOldestSequenceNum (now GetProximalSeqnoBeforeTime) and TruncateOldEntries were lacking unit tests, despite both being used in production (experimental feature). Added those and expanded others. 
Reviewed By: jowlyzhang Differential Revision: D49755592 Pulled By: pdillinger fbshipit-source-id: f72a3baac74d24b963c77e538bba89a7fc8dce51
2023-09-29 18:21:59 +00:00
seq_end = dbfull()->GetLatestSequenceNumber() + 1;
ASSERT_OK(Flush());
tables_props.clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
ASSERT_EQ(tables_props.size(), 4);
it = tables_props.begin();
while (it != tables_props.end()) {
if (!checked_file_nums.count(it->second->orig_file_number)) {
break;
}
it++;
}
ASSERT_TRUE(it != tables_props.end());
tp_mapping.Clear();
ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
ASSERT_OK(tp_mapping.Sort());
seqs = tp_mapping.TEST_GetInternalMapping();
ASSERT_GE(seqs.size(), 99);
ASSERT_LE(seqs.size(), 101);
checked_file_nums.insert(it->second->orig_file_number);
// re-enable compaction
ASSERT_OK(dbfull()->SetOptions({
{"disable_auto_compactions", "false"},
}));
ASSERT_OK(dbfull()->TEST_WaitForCompact());
tables_props.clear();
ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
ASSERT_GE(tables_props.size(), 1);
it = tables_props.begin();
while (it != tables_props.end()) {
if (!checked_file_nums.count(it->second->orig_file_number)) {
break;
}
it++;
}
ASSERT_TRUE(it != tables_props.end());
tp_mapping.Clear();
ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
ASSERT_OK(tp_mapping.Sort());
seqs = tp_mapping.TEST_GetInternalMapping();
ASSERT_GE(seqs.size(), 99);
ASSERT_LE(seqs.size(), 101);
for (auto i = start_seq; i < seq_end; i++) {
Update tiered storage tests (ahead of next change) (#11917) Summary: After https://github.com/facebook/rocksdb/issues/11905, I am preparing a DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping. **Intended follow-up** However, there are a number of test changes I want to make prior to that to make it clear that I am not regressing the tests and production behavior at the same time. * Start mock time in the tests well beyond epoch (time 0) so that we aren't normally reaching into pre-history for current time minus the preserve/preclude duration. * Majorly clean up BasicSeqnoToTimeMapping to avoid confusing hard-coded bounds on GetProximalTimeBeforeSeqno() results. * There is an unresolved/unexplained issue marked with FIXME that should be investigated when GetProximalTimeBeforeSeqno() is put into production. * MultiCFs test was strangely generating 5 L0 files, four of which would be compacted into an L1, and then letting TTL compaction compact 1@L0+1@L1. Changing the starting time of the tests seemed to mess up the TTL compaction. But I suspect the TTL compaction was unintentional, so I've cut it down to just 4 L0 files, which compacts predictably. * Unrelated: allow ROCKSDB_NO_STACK=1 to skip printing a stack trace on assertion failures. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11917 Test Plan: no changes to production code Reviewed By: jowlyzhang Differential Revision: D49841436 Pulled By: pdillinger fbshipit-source-id: 753348ace9c548e82bcb77fcc8b2ffb7a6beeb0a
2023-10-02 23:19:05 +00:00
// aged out entries allowed to report time=0
// FIXME: should be <=
if ((seq_end - i) * 100 < 10000) {
ASSERT_GE(tp_mapping.GetProximalTimeBeforeSeqno(i),
start_time + (i - start_seq) * 100 - 100);
}
Refactor, clean up, fixes, and more testing for SeqnoToTimeMapping (#11905) Summary: This change is before a planned DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping (bug fix with existing test work-arounds). **Intended follow-up** However, I found enough issues with SeqnoToTimeMapping to warrant this PR first, including very small fixes in DB implementation related to API contract of SeqnoToTimeMapping. Functional fixes / changes: * This fixes some mishandling of boundary cases. For example, if the user decides to stop writing to DB, the last written sequence number would perpetually have its write time updated to "now" and would always be ineligible for migration to cold tier. Part of the problem is that the SeqnoToTimeMapping would return a seqno known to have been written before (immediately or otherwise) the requested time, but compaction_job.cc would include that seqno in the preserve/exclude set. That is fixed (in part) by adding one in compaction_job.cc * That problem was worse because a whole range of seqnos could be updated perpetually with new times in SeqnoToTimeMapping::Append (if no writes to DB). That logic was apparently optimized for GetOldestApproximateTime (now GetProximalTimeBeforeSeqno), which is not used in production, to the detriment of GetOldestSequenceNum (now GetProximalSeqnoBeforeTime), which is used in production. (Perhaps plans changed during development?) This is fixed in Append to optimize for accuracy of GetProximalSeqnoBeforeTime. (Unit tests added and updated.) * Related: SeqnoToTimeMapping did not have a clear contract about the relationships between seqnos and times, just the idea of a rough correspondence. Now the class description makes it clear that the write time of each recorded seqno comes before or at the associated time, to support getting best results for GetProximalSeqnoBeforeTime. And this makes it easier to make clear the contract of each API function. 
* Update `DBImpl::RecordSeqnoToTimeMapping()` to follow this ordering in gathering samples. Some part of these changes has required an expanded test work-around for the problem (see intended follow-up above) that the DB does not immediately ensure recent seqnos are covered by its mapping. These work-arounds will be removed with that planned work. An apparent compaction bug is revealed in PrecludeLastLevelTest::RangeDelsCauseFileEndpointsToOverlap, so that test is disabled. Filed GitHub issue #11909 Cosmetic / code safety things (not exhaustive): * Fix some confusing names. * `seqno_time_mapping` was used inconsistently in places. Now just `seqno_to_time_mapping` to correspond to class name. * Rename confusing `GetOldestSequenceNum` -> `GetProximalSeqnoBeforeTime` and `GetOldestApproximateTime` -> `GetProximalTimeBeforeSeqno`. Part of the motivation is that our times and seqnos here have the same underlying type, so we want to be clear about which is expected where to avoid mixing. * Rename `kUnknownSeqnoTime` to `kUnknownTimeBeforeAll` because the value is a bad choice for unknown if we ever add ProximalAfterBlah functions. * Arithmetic on SeqnoTimePair doesn't make sense except for delta encoding, so use better names / APIs with that in mind. * (OMG) Don't allow direct comparison between SeqnoTimePair and SequenceNumber. (There is no checking that it isn't compared against time by accident.) * A field name essentially matching the containing class name is a confusing pattern (`seqno_time_mapping_`). * Wrap calls to confusing (but useful) upper_bound and lower_bound functions to have clearer names and more code reuse. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11905 Test Plan: GetOldestSequenceNum (now GetProximalSeqnoBeforeTime) and TruncateOldEntries were lacking unit tests, despite both being used in production (experimental feature). Added those and expanded others. 
Reviewed By: jowlyzhang Differential Revision: D49755592 Pulled By: pdillinger fbshipit-source-id: f72a3baac74d24b963c77e538bba89a7fc8dce51
2023-09-29 18:21:59 +00:00
ASSERT_LE(tp_mapping.GetProximalTimeBeforeSeqno(i),
Update tiered storage tests (ahead of next change) (#11917) Summary: After https://github.com/facebook/rocksdb/issues/11905, I am preparing a DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping. **Intended follow-up** However, there are a number of test changes I want to make prior to that to make it clear that I am not regressing the tests and production behavior at the same time. * Start mock time in the tests well beyond epoch (time 0) so that we aren't normally reaching into pre-history for current time minus the preserve/preclude duration. * Majorly clean up BasicSeqnoToTimeMapping to avoid confusing hard-coded bounds on GetProximalTimeBeforeSeqno() results. * There is an unresolved/unexplained issue marked with FIXME that should be investigated when GetProximalTimeBeforeSeqno() is put into production. * MultiCFs test was strangely generating 5 L0 files, four of which would be compacted into an L1, and then letting TTL compaction compact 1@L0+1@L1. Changing the starting time of the tests seemed to mess up the TTL compaction. But I suspect the TTL compaction was unintentional, so I've cut it down to just 4 L0 files, which compacts predictably. * Unrelated: allow ROCKSDB_NO_STACK=1 to skip printing a stack trace on assertion failures. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11917 Test Plan: no changes to production code Reviewed By: jowlyzhang Differential Revision: D49841436 Pulled By: pdillinger fbshipit-source-id: 753348ace9c548e82bcb77fcc8b2ffb7a6beeb0a
2023-10-02 23:19:05 +00:00
start_time + (i - start_seq) * 100);
}
ASSERT_OK(db_->Close());
}
// Exercises seqno-to-time tracking when column families with different
// tracking settings coexist in one DB:
//   - "default": tracking disabled (both duration options are 0)
//   - "one":     tracking window of 10000 seconds
//   - "two":     tracking window of 1000000 seconds
// The test checks which SST files carry a seqno_to_time_mapping property,
// how many entries the in-memory and per-file mappings hold, and that the
// kRecordSeqnoTime periodic task exists only while some CF has tracking on.
TEST_P(SeqnoTimeTablePropTest, MultiCFs) {
  Options base_opts = CurrentOptions();
  base_opts.preclude_last_level_data_seconds = 0;
  base_opts.preserve_internal_time_seconds = 0;
  base_opts.env = mock_env_.get();
  base_opts.stats_dump_period_sec = 0;
  base_opts.stats_persist_period_sec = 0;
  ReopenWithColumnFamilies({"default"}, base_opts);

  // Callbacks handed to TEST_WaitForPeriodicTaskRun to move the mock clock.
  const auto advance_100s = [&] { mock_clock_->MockSleepForSeconds(100); };
  const auto advance_10s = [&] { mock_clock_->MockSleepForSeconds(10); };

  // No CF has tracking enabled yet, so the recording task must not exist.
  const PeriodicTaskScheduler& task_scheduler =
      dbfull()->TEST_GetPeriodicTaskScheduler();
  ASSERT_FALSE(
      task_scheduler.TEST_HasTask(PeriodicTaskType::kRecordSeqnoTime));

  // Write to the default CF while time advances; nothing should be recorded,
  // neither in the flushed file nor in the DB's in-memory mapping.
  for (int i = 0; i < 200; ++i) {
    ASSERT_OK(Put(Key(i), "value"));
    dbfull()->TEST_WaitForPeriodicTaskRun(advance_100s);
  }
  ASSERT_OK(Flush());
  TablePropertiesCollection props_by_file;
  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&props_by_file));
  ASSERT_EQ(props_by_file.size(), 1);
  auto file_it = props_by_file.begin();
  ASSERT_TRUE(file_it->second->seqno_to_time_mapping.empty());
  ASSERT_TRUE(dbfull()->TEST_GetSeqnoToTimeMapping().Empty());

  // Creating CF "one" with a 10000 second window turns the recording task on.
  Options cf_one_opts = base_opts;
  SetTrackTimeDurationOptions(10000, cf_one_opts);
  CreateColumnFamilies({"one"}, cf_one_opts);
  ASSERT_TRUE(task_scheduler.TEST_HasTask(PeriodicTaskType::kRecordSeqnoTime));

  // More writes to the default CF, which still has tracking disabled.
  for (int i = 0; i < 200; ++i) {
    ASSERT_OK(Put(Key(i), "value"));
    dbfull()->TEST_WaitForPeriodicTaskRun(advance_100s);
  }
  ASSERT_OK(Flush());

  // A short burst of writes to CF "one": 20 keys spread over 200 seconds.
  for (int i = 0; i < 20; ++i) {
    ASSERT_OK(Put(1, Key(i), "value"));
    dbfull()->TEST_WaitForPeriodicTaskRun(advance_10s);
  }
  ASSERT_OK(Flush(1));

  // The single file of CF "one" must carry a small, non-empty mapping.
  props_by_file.clear();
  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[1], &props_by_file));
  ASSERT_EQ(props_by_file.size(), 1);
  file_it = props_by_file.begin();
  SeqnoToTimeMapping file_mapping;
  ASSERT_OK(file_mapping.Add(file_it->second->seqno_to_time_mapping));
  ASSERT_OK(file_mapping.Sort());
  ASSERT_FALSE(file_mapping.Empty());
  auto pairs = file_mapping.TEST_GetInternalMapping();
  ASSERT_GE(pairs.size(), 1);
  ASSERT_LE(pairs.size(), 4);

  // Add CF "two" with a much larger window (1000000 seconds).
  Options cf_two_opts = base_opts;
  SetTrackTimeDurationOptions(1000000, cf_two_opts);
  CreateColumnFamilies({"two"}, cf_two_opts);

  // Write enough to CF "two" to fill the in-memory mapping.
  for (int i = 0; i < 2000; ++i) {
    ASSERT_OK(Put(2, Key(i), "value"));
    dbfull()->TEST_WaitForPeriodicTaskRun(advance_100s);
  }
  pairs = dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping();
  ASSERT_GE(pairs.size(), 1000 - 1);
  ASSERT_LE(pairs.size(), 1000 + 1);

  // The flushed file stores at most (about) 100 encoded entries.
  ASSERT_OK(Flush(2));
  props_by_file.clear();
  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[2], &props_by_file));
  ASSERT_EQ(props_by_file.size(), 1);
  file_it = props_by_file.begin();
  file_mapping.Clear();
  ASSERT_OK(file_mapping.Add(file_it->second->seqno_to_time_mapping));
  ASSERT_OK(file_mapping.Sort());
  pairs = file_mapping.TEST_GetInternalMapping();
  ASSERT_GE(pairs.size(), 100 - 1);
  ASSERT_LE(pairs.size(), 100 + 1);

  // Write to the default CF after every tracking-enabled memtable has been
  // flushed.
  // NOTE(review): the in-memory mapping is fetched below but never inspected;
  // confirm whether an assertion on its contents was intended here.
  for (int i = 0; i < 10; ++i) {
    ASSERT_OK(Put(0, Key(i), "value"));
    dbfull()->TEST_WaitForPeriodicTaskRun(advance_100s);
  }
  pairs = dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping();
  ASSERT_OK(Flush(0));

  // Three more flushes of CF "two" trigger a compaction; the compaction
  // output must also carry a seqno_to_time_mapping.
  for (int round = 0; round < 3; ++round) {
    for (int i = 0; i < 200; ++i) {
      ASSERT_OK(Put(2, Key(i), "value"));
      dbfull()->TEST_WaitForPeriodicTaskRun(advance_100s);
    }
    ASSERT_OK(Flush(2));
  }
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  props_by_file.clear();
  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[2], &props_by_file));
  ASSERT_EQ(props_by_file.size(), 1);
  file_it = props_by_file.begin();
  file_mapping.Clear();
  ASSERT_OK(file_mapping.Add(file_it->second->seqno_to_time_mapping));
  ASSERT_OK(file_mapping.Sort());
  pairs = file_mapping.TEST_GetInternalMapping();
  ASSERT_GE(pairs.size(), 99);
  ASSERT_LE(pairs.size(), 101);

  // The default CF compacts down to one file that still has no mapping.
  for (int i = 0; i < 200; ++i) {
    ASSERT_OK(Put(0, Key(i), "value"));
    dbfull()->TEST_WaitForPeriodicTaskRun(advance_100s);
  }
  ASSERT_OK(Flush(0));
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  props_by_file.clear();
  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[0], &props_by_file));
  ASSERT_EQ(props_by_file.size(), 1);
  file_it = props_by_file.begin();
  ASSERT_TRUE(file_it->second->seqno_to_time_mapping.empty());

  // Accumulate entries for CF "two" without flushing.
  for (int i = 0; i < 1000; ++i) {
    ASSERT_OK(Put(2, Key(i), "value"));
    dbfull()->TEST_WaitForPeriodicTaskRun(advance_100s);
  }
  ASSERT_GE(
      dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping().size(),
      500);

  // Once CF "one" is dropped, the in-memory mapping follows only the options
  // of CF "two".
  ASSERT_OK(db_->DropColumnFamily(handles_[1]));
  ASSERT_LE(
      dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping().size(),
      100 + 5);

  // Dropping CF "two" as well leaves the in-memory mapping empty ...
  ASSERT_OK(db_->DropColumnFamily(handles_[2]));
  ASSERT_EQ(
      dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping().size(),
      0);

  // ... and the recording task is gone again.
  ASSERT_FALSE(
      task_scheduler.TEST_HasTask(PeriodicTaskType::kRecordSeqnoTime));
  Close();
}
// Opens two additional DB instances with seqno-to-time tracking enabled
// (10000 second window) and verifies that the second instance accumulates
// mapping entries while the mock clock advances.
TEST_P(SeqnoTimeTablePropTest, MultiInstancesBasic) {
  const int kInstanceNum = 2;

  Options opts = CurrentOptions();
  SetTrackTimeDurationOptions(10000, opts);
  opts.env = mock_env_.get();
  opts.stats_dump_period_sec = 0;
  opts.stats_persist_period_sec = 0;

  // Each instance lives in its own per-thread path named after its index.
  std::vector<DB*> instances(kInstanceNum);
  for (int idx = 0; idx < kInstanceNum; ++idx) {
    ASSERT_OK(DB::Open(opts, test::PerThreadDBPath(std::to_string(idx)),
                       &(instances[idx])));
  }

  // Make sure the second instance has the worker enabled.
  auto second_db = static_cast_with_check<DBImpl>(instances[1]);
  WriteOptions write_opts;
  const auto advance_100s = [&] { mock_clock_->MockSleepForSeconds(100); };
  for (int i = 0; i < 200; ++i) {
    ASSERT_OK(second_db->Put(write_opts, Key(i), "value"));
    // NOTE(review): the wait is issued through the fixture's own DB
    // (dbfull()), not through `second_db`; presumably the periodic task
    // machinery is shared between instances. Confirm.
    dbfull()->TEST_WaitForPeriodicTaskRun(advance_100s);
  }
  SeqnoToTimeMapping recorded = second_db->TEST_GetSeqnoToTimeMapping();
  ASSERT_GT(recorded.Size(), 10);

  // The instances were opened through raw pointers, so close and free each.
  for (DB* instance : instances) {
    ASSERT_OK(instance->Close());
    delete instance;
  }
}
// Verifies seqno-to-time mappings under universal compaction:
//   1. every flushed file carries a mapping of the expected size,
//   2. the mapping survives automatic and manual compaction,
//   3. sequence numbers are zeroed only for data that has aged out of the
//      10000 second tracking window, and
//   4. once everything has aged out, the data ends up on the last level.
// The "CompactionIterator::PrepareOutput:ZeroingSeq" sync point is counted to
// observe seqno zeroing.
TEST_P(SeqnoTimeTablePropTest, SeqnoToTimeMappingUniversal) {
  const int kNumTrigger = 4;
  const int kNumLevels = 7;
  const int kNumKeys = 100;

  Options options = CurrentOptions();
  SetTrackTimeDurationOptions(10000, options);
  options.compaction_style = kCompactionStyleUniversal;
  options.num_levels = kNumLevels;
  options.env = mock_env_.get();

  DestroyAndReopen(options);

  // Incremented each time the compaction iterator reports zeroing a seqno.
  std::atomic_uint64_t num_seqno_zeroing{0};

  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();
  SyncPoint::GetInstance()->SetCallBack(
      "CompactionIterator::PrepareOutput:ZeroingSeq",
      [&](void* /*arg*/) { num_seqno_zeroing++; });
  SyncPoint::GetInstance()->EnableProcessing();

  // Create kNumTrigger - 1 files. Consecutive files overlap by one key
  // because the key offset advances by kNumKeys - 1 per file. Each key is
  // written 10 seconds after the previous one.
  int sst_num = 0;
  for (; sst_num < kNumTrigger - 1; sst_num++) {
    for (int i = 0; i < kNumKeys; i++) {
      ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
      dbfull()->TEST_WaitForPeriodicTaskRun(
          [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
    }
    ASSERT_OK(Flush());
  }
  TablePropertiesCollection tables_props;
  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
  ASSERT_EQ(tables_props.size(), 3);
  for (const auto& props : tables_props) {
    ASSERT_FALSE(props.second->seqno_to_time_mapping.empty());
    SeqnoToTimeMapping tp_mapping;
    ASSERT_OK(tp_mapping.Add(props.second->seqno_to_time_mapping));
    ASSERT_OK(tp_mapping.Sort());
    ASSERT_FALSE(tp_mapping.Empty());
    auto seqs = tp_mapping.TEST_GetInternalMapping();
    // Add (roughly) one for starting entry.
    ASSERT_GE(seqs.size(), 10);
    ASSERT_LE(seqs.size(), 10 + 2);
  }

  // Trigger a compaction by flushing the kNumTrigger-th file.
  for (int i = 0; i < kNumKeys; i++) {
    ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
    dbfull()->TEST_WaitForPeriodicTaskRun(
        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
  }
  sst_num++;
  ASSERT_OK(Flush());
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  // The compaction output is a single file that still has a decodable
  // mapping.
  tables_props.clear();
  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
  ASSERT_EQ(tables_props.size(), 1);

  auto it = tables_props.begin();
  SeqnoToTimeMapping tp_mapping;
  ASSERT_FALSE(it->second->seqno_to_time_mapping.empty());
  ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));

  // Compact to the last level.
  CompactRangeOptions cro;
  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
  // Make sure the data is all compacted to the penultimate level if the
  // feature is on; otherwise it is compacted to the last level.
  if (options.preclude_last_level_data_seconds > 0) {
    ASSERT_GT(NumTableFilesAtLevel(5), 0);
    ASSERT_EQ(NumTableFilesAtLevel(6), 0);
  } else {
    ASSERT_EQ(NumTableFilesAtLevel(5), 0);
    ASSERT_GT(NumTableFilesAtLevel(6), 0);
  }

  // Regardless of whether the file is on the last level, it should keep the
  // time information, and no sequence number should have been zeroed yet.
  tables_props.clear();
  tp_mapping.Clear();
  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
  ASSERT_EQ(tables_props.size(), 1);
  ASSERT_EQ(num_seqno_zeroing, 0);

  it = tables_props.begin();
  ASSERT_FALSE(it->second->seqno_to_time_mapping.empty());
  ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));

  // Make half of the data expired.
  mock_clock_->MockSleepForSeconds(static_cast<int>(8000));
  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));

  tables_props.clear();
  tp_mapping.Clear();
  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));

  if (options.preclude_last_level_data_seconds > 0) {
    ASSERT_EQ(tables_props.size(), 2);
  } else {
    ASSERT_EQ(tables_props.size(), 1);
  }
  ASSERT_GT(num_seqno_zeroing, 0);
  std::vector<KeyVersion> key_versions;
  ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
                              std::numeric_limits<size_t>::max(),
                              &key_versions));
  // Make sure there are more than 300 keys, that the first 100 keys have
  // their seqno zeroed out, and that the last 100 keys do not.
  ASSERT_GT(key_versions.size(), 300);
  for (int i = 0; i < 100; i++) {
    ASSERT_EQ(key_versions[i].sequence, 0);
  }
  auto rit = key_versions.rbegin();
  for (int i = 0; i < 100; i++) {
    ASSERT_GT(rit->sequence, 0);
    rit++;
  }

  // Make all data expired and compact again to push it to the last level,
  // regardless of whether the tiering feature is enabled.
  mock_clock_->MockSleepForSeconds(static_cast<int>(20000));

  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));

  ASSERT_GT(num_seqno_zeroing, 0);
  ASSERT_GT(NumTableFilesAtLevel(6), 0);

  Close();
}
Bootstrap, pre-populate seqno_to_time_mapping (#11922) Summary: This change has two primary goals (follow-up to https://github.com/facebook/rocksdb/issues/11917, https://github.com/facebook/rocksdb/issues/11920): * Ensure the DB seqno_to_time_mapping has entries that allow us to put a good time lower bound on any writes that happen after setting up preserve/preclude options (either in a new DB, new CF, SetOptions, etc.) and haven't yet aged out of that time window. This allows us to remove a bunch of work-arounds in tests. * For new DBs using preserve/preclude options, automatically reserve some sequence numbers and pre-map them to cover the time span back to the preserve/preclude cut-off time. In the future, this will allow us to import data from another DB by key, value, and write time by assigning an appropriate seqno in this DB for that write time. Note that the pre-population (historical mappings) does not happen if the original options at DB Open time do not have preserve/preclude, so it is recommended to create initial column families at that time with create_missing_column_families, to take advantage of this (future) feature. (Adding these historical mappings after DB Open would risk non-monotonic seqno_to_time_mapping, which is dubious if not dangerous.) Recommended follow-up: * Solve existing race conditions (not memory safety) where parallel operations like CreateColumnFamily or SetDBOptions could leave the wrong setting in effect. * Make SeqnoToTimeMapping more gracefully handle a possible case in which too many mappings are added for the time range of concern. It seems like there could be cases where data is massively excluded from the cold tier because of entries falling off the front of the mapping list (causing GetProximalSeqnoBeforeTime() to return 0). (More investigation needed.) No release note for the minor bug fix because this is still an experimental feature with limited usage. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11922 Test Plan: tests added / updated Reviewed By: jowlyzhang Differential Revision: D49956563 Pulled By: pdillinger fbshipit-source-id: 92beb918c3a298fae9ca8e509717b1067caa1519
2023-10-06 15:21:21 +00:00
TEST_P(SeqnoTimeTablePropTest, PrePopulateInDB) {
Options base_options = CurrentOptions();
base_options.env = mock_env_.get();
base_options.disable_auto_compactions = true;
base_options.create_missing_column_families = true;
Options track_options = base_options;
constexpr uint32_t kPreserveSecs = 1234567;
SetTrackTimeDurationOptions(kPreserveSecs, track_options);
SeqnoToTimeMapping sttm;
SequenceNumber latest_seqno;
uint64_t start_time, end_time;
// #### DB#1, #2: No pre-population without preserve/preclude ####
// #### But a single entry is added when preserve/preclude enabled ####
for (bool with_write : {false, true}) {
SCOPED_TRACE("with_write=" + std::to_string(with_write));
DestroyAndReopen(base_options);
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
ASSERT_TRUE(sttm.Empty());
ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
if (with_write) {
// Ensure that writes before new CF with preserve/preclude option don't
// interfere with the seqno-to-time mapping getting a starting entry.
ASSERT_OK(Put("foo", "bar"));
ASSERT_OK(Flush());
Use manifest to persist pre-allocated seqnos (#11995) Summary: ... and other fixes for crash test after https://github.com/facebook/rocksdb/issues/11922. * When pre-allocating sequence numbers for establishing a time history, record that last sequence number in the manifest so that it is (most likely) restored on recovery even if no user writes were made or were recovered (e.g. no WAL). * When pre-allocating sequence numbers for establishing a time history, only do this for actually new DBs. * Remove the feature that ensures non-zero sequence number on creating the first column family with preserve/preclude option after initial DB::Open. Until fixed in a way compatible with the crash test, this creates a gap where some data written with active preserve/preclude option won't have a known associated time. Together, these ensure we don't upset the crash test by manipulating sequence numbers after initial DB creation (esp when re-opening with different options). (The crash test expects that the seqno after re-open corresponds to a known point in time from previous crash test operation, matching an expected DB state.) Follow-up work: * Re-fill the gap to ensure all data written under preserve/preclude settings have a known time estimate. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11995 Test Plan: Added to unit test SeqnoTimeTablePropTest.PrePopulateInDB Verified fixes two crash test scenarios: ## 1st reproducer First apply ``` diff --git a/db_stress_tool/expected_state.cc b/db_stress_tool/expected_state.cc index b483e154c..ef63b8d6c 100644 --- a/db_stress_tool/expected_state.cc +++ b/db_stress_tool/expected_state.cc @@ -333,6 +333,7 @@ Status FileExpectedStateManager::SaveAtAndAfter(DB* db) { s = NewFileTraceWriter(Env::Default(), soptions, trace_file_path, &trace_writer); } + if (getenv("CRASH")) assert(false); if (s.ok()) { TraceOptions trace_opts; trace_opts.filter |= kTraceFilterGet; ``` Then ``` mkdir -p /dev/shm/rocksdb_test/rocksdb_crashtest_expected mkdir -p /dev/shm/rocksdb_test/rocksdb_crashtest_whitebox rm -rf /dev/shm/rocksdb_test/rocksdb_crashtest_*/* CRASH=1 ./db_stress --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --destroy_db_initially=1 --manual_wal_flush_one_in=1000000 --clear_column_family_one_in=0 --preserve_internal_time_seconds=36000 ./db_stress --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --destroy_db_initially=0 --manual_wal_flush_one_in=1000000 --clear_column_family_one_in=0 --preserve_internal_time_seconds=0 ``` Without the fix you get ``` ... 
DB path: [/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox] (Re-)verified 34 unique IDs Error restoring historical expected values: Corruption: DB is older than any restorable expected state ``` ## 2nd reproducer First apply ``` diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 62ddead7b..f2654980f 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -1126,6 +1126,7 @@ void StressTest::OperateDb(ThreadState* thread) { // OPERATION write TestPut(thread, write_opts, read_opts, rand_column_families, rand_keys, value); + if (getenv("CRASH")) assert(false); } else if (prob_op < del_bound) { assert(write_bound <= prob_op); // OPERATION delete ``` Then ``` rm -rf /dev/shm/rocksdb_test/rocksdb_crashtest_*/* CRASH=1 ./db_stress --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --destroy_db_initially=1 --manual_wal_flush_one_in=1000000 --clear_column_family_one_in=0 --disable_wal=1 --reopen=0 --preserve_internal_time_seconds=0 ./db_stress --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --destroy_db_initially=0 --manual_wal_flush_one_in=1000000 --clear_column_family_one_in=0 --disable_wal=1 --reopen=0 --preserve_internal_time_seconds=3600 ``` Without the fix you get ``` DB path: [/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox] (Re-)verified 34 unique IDs db_stress: db_stress_tool/expected_state.cc:380: virtual rocksdb::{anonymous}::ExpectedStateTraceRecordHandler::~ ExpectedStateTraceRecordHandler(): Assertion `IsDone()' failed. ``` Reviewed By: jowlyzhang Differential Revision: D50533346 Pulled By: pdillinger fbshipit-source-id: 1056be45c5b9e537c8c601b28c4b27431a782477
2023-10-23 16:20:59 +00:00
} else {
// FIXME: currently, starting entry after CreateColumnFamily requires
// non-zero seqno
ASSERT_OK(Delete("blah"));
Bootstrap, pre-populate seqno_to_time_mapping (#11922) Summary: This change has two primary goals (follow-up to https://github.com/facebook/rocksdb/issues/11917, https://github.com/facebook/rocksdb/issues/11920): * Ensure the DB seqno_to_time_mapping has entries that allow us to put a good time lower bound on any writes that happen after setting up preserve/preclude options (either in a new DB, new CF, SetOptions, etc.) and haven't yet aged out of that time window. This allows us to remove a bunch of work-arounds in tests. * For new DBs using preserve/preclude options, automatically reserve some sequence numbers and pre-map them to cover the time span back to the preserve/preclude cut-off time. In the future, this will allow us to import data from another DB by key, value, and write time by assigning an appropriate seqno in this DB for that write time. Note that the pre-population (historical mappings) does not happen if the original options at DB Open time do not have preserve/preclude, so it is recommended to create initial column families at that time with create_missing_column_families, to take advantage of this (future) feature. (Adding these historical mappings after DB Open would risk non-monotonic seqno_to_time_mapping, which is dubious if not dangerous.) Recommended follow-up: * Solve existing race conditions (not memory safety) where parallel operations like CreateColumnFamily or SetDBOptions could leave the wrong setting in effect. * Make SeqnoToTimeMapping more gracefully handle a possible case in which too many mappings are added for the time range of concern. It seems like there could be cases where data is massively excluded from the cold tier because of entries falling off the front of the mapping list (causing GetProximalSeqnoBeforeTime() to return 0). (More investigation needed.) No release note for the minor bug fix because this is still an experimental feature with limited usage. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11922 Test Plan: tests added / updated Reviewed By: jowlyzhang Differential Revision: D49956563 Pulled By: pdillinger fbshipit-source-id: 92beb918c3a298fae9ca8e509717b1067caa1519
2023-10-06 15:21:21 +00:00
}
// Unfortunately, if we add a CF with preserve/preclude option after
// open, that does not reserve seqnos with pre-populated time mappings.
CreateColumnFamilies({"one"}, track_options);
// No pre-population (unfortunately), just a single starting entry
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
latest_seqno = db_->GetLatestSequenceNumber();
start_time = mock_clock_->NowSeconds();
ASSERT_EQ(sttm.Size(), 1);
ASSERT_EQ(latest_seqno, 1U);
// Current time maps to starting entry / seqno
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time), 1U);
// Any older times are unknown.
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time - 1),
kUnknownSeqnoBeforeAll);
// Now check that writes can proceed normally (passing about 20% of preserve
// time)
for (int i = 0; i < 20; i++) {
ASSERT_OK(Put(Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun([&] {
mock_clock_->MockSleepForSeconds(static_cast<int>(kPreserveSecs / 99));
});
}
ASSERT_OK(Flush());
// Check that mappings are getting populated
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
latest_seqno = db_->GetLatestSequenceNumber();
end_time = mock_clock_->NowSeconds();
ASSERT_EQ(sttm.Size(), 21);
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(end_time), latest_seqno);
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time), 1U);
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time - 1),
kUnknownSeqnoBeforeAll);
}
// #### DB#3, #4: Read-only open with preserve/preclude after creating the
// DB without it ####
// Make sure we don't hit issues with read-only DBs, which don't need
// the mapping in the DB state (though it wouldn't hurt anything)
for (bool with_write : {false, true}) {
SCOPED_TRACE("with_write=" + std::to_string(with_write));
DestroyAndReopen(base_options);
if (with_write) {
ASSERT_OK(Put("foo", "bar"));
ASSERT_OK(Flush());
}
ASSERT_OK(ReadOnlyReopen(base_options));
if (with_write) {
ASSERT_EQ(Get("foo"), "bar");
}
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
ASSERT_EQ(sttm.Size(), 0);
Use manifest to persist pre-allocated seqnos (#11995) Summary: ... and other fixes for crash test after https://github.com/facebook/rocksdb/issues/11922. * When pre-allocating sequence numbers for establishing a time history, record that last sequence number in the manifest so that it is (most likely) restored on recovery even if no user writes were made or were recovered (e.g. no WAL). * When pre-allocating sequence numbers for establishing a time history, only do this for actually new DBs. * Remove the feature that ensures non-zero sequence number on creating the first column family with preserve/preclude option after initial DB::Open. Until fixed in a way compatible with the crash test, this creates a gap where some data written with active preserve/preclude option won't have a known associated time. Together, these ensure we don't upset the crash test by manipulating sequence numbers after initial DB creation (esp when re-opening with different options). (The crash test expects that the seqno after re-open corresponds to a known point in time from previous crash test operation, matching an expected DB state.) Follow-up work: * Re-fill the gap to ensure all data written under preserve/preclude settings have a known time estimate. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11995 Test Plan: Added to unit test SeqnoTimeTablePropTest.PrePopulateInDB Verified fixes two crash test scenarios: ## 1st reproducer First apply ``` diff --git a/db_stress_tool/expected_state.cc b/db_stress_tool/expected_state.cc index b483e154c..ef63b8d6c 100644 --- a/db_stress_tool/expected_state.cc +++ b/db_stress_tool/expected_state.cc @@ -333,6 +333,7 @@ Status FileExpectedStateManager::SaveAtAndAfter(DB* db) { s = NewFileTraceWriter(Env::Default(), soptions, trace_file_path, &trace_writer); } + if (getenv("CRASH")) assert(false); if (s.ok()) { TraceOptions trace_opts; trace_opts.filter |= kTraceFilterGet; ``` Then ``` mkdir -p /dev/shm/rocksdb_test/rocksdb_crashtest_expected mkdir -p /dev/shm/rocksdb_test/rocksdb_crashtest_whitebox rm -rf /dev/shm/rocksdb_test/rocksdb_crashtest_*/* CRASH=1 ./db_stress --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --destroy_db_initially=1 --manual_wal_flush_one_in=1000000 --clear_column_family_one_in=0 --preserve_internal_time_seconds=36000 ./db_stress --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --destroy_db_initially=0 --manual_wal_flush_one_in=1000000 --clear_column_family_one_in=0 --preserve_internal_time_seconds=0 ``` Without the fix you get ``` ... 
DB path: [/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox] (Re-)verified 34 unique IDs Error restoring historical expected values: Corruption: DB is older than any restorable expected state ``` ## 2nd reproducer First apply ``` diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 62ddead7b..f2654980f 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -1126,6 +1126,7 @@ void StressTest::OperateDb(ThreadState* thread) { // OPERATION write TestPut(thread, write_opts, read_opts, rand_column_families, rand_keys, value); + if (getenv("CRASH")) assert(false); } else if (prob_op < del_bound) { assert(write_bound <= prob_op); // OPERATION delete ``` Then ``` rm -rf /dev/shm/rocksdb_test/rocksdb_crashtest_*/* CRASH=1 ./db_stress --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --destroy_db_initially=1 --manual_wal_flush_one_in=1000000 --clear_column_family_one_in=0 --disable_wal=1 --reopen=0 --preserve_internal_time_seconds=0 ./db_stress --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --destroy_db_initially=0 --manual_wal_flush_one_in=1000000 --clear_column_family_one_in=0 --disable_wal=1 --reopen=0 --preserve_internal_time_seconds=3600 ``` Without the fix you get ``` DB path: [/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox] (Re-)verified 34 unique IDs db_stress: db_stress_tool/expected_state.cc:380: virtual rocksdb::{anonymous}::ExpectedStateTraceRecordHandler::~ ExpectedStateTraceRecordHandler(): Assertion `IsDone()' failed. ``` Reviewed By: jowlyzhang Differential Revision: D50533346 Pulled By: pdillinger fbshipit-source-id: 1056be45c5b9e537c8c601b28c4b27431a782477
2023-10-23 16:20:59 +00:00
if (!with_write) {
ASSERT_EQ(db_->GetLatestSequenceNumber(), 0);
}
Bootstrap, pre-populate seqno_to_time_mapping (#11922) Summary: This change has two primary goals (follow-up to https://github.com/facebook/rocksdb/issues/11917, https://github.com/facebook/rocksdb/issues/11920): * Ensure the DB seqno_to_time_mapping has entries that allow us to put a good time lower bound on any writes that happen after setting up preserve/preclude options (either in a new DB, new CF, SetOptions, etc.) and haven't yet aged out of that time window. This allows us to remove a bunch of work-arounds in tests. * For new DBs using preserve/preclude options, automatically reserve some sequence numbers and pre-map them to cover the time span back to the preserve/preclude cut-off time. In the future, this will allow us to import data from another DB by key, value, and write time by assigning an appropriate seqno in this DB for that write time. Note that the pre-population (historical mappings) does not happen if the original options at DB Open time do not have preserve/preclude, so it is recommended to create initial column families at that time with create_missing_column_families, to take advantage of this (future) feature. (Adding these historical mappings after DB Open would risk non-monotonic seqno_to_time_mapping, which is dubious if not dangerous.) Recommended follow-up: * Solve existing race conditions (not memory safety) where parallel operations like CreateColumnFamily or SetDBOptions could leave the wrong setting in effect. * Make SeqnoToTimeMapping more gracefully handle a possible case in which too many mappings are added for the time range of concern. It seems like there could be cases where data is massively excluded from the cold tier because of entries falling off the front of the mapping list (causing GetProximalSeqnoBeforeTime() to return 0). (More investigation needed.) No release note for the minor bug fix because this is still an experimental feature with limited usage. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11922 Test Plan: tests added / updated Reviewed By: jowlyzhang Differential Revision: D49956563 Pulled By: pdillinger fbshipit-source-id: 92beb918c3a298fae9ca8e509717b1067caa1519
2023-10-06 15:21:21 +00:00
ASSERT_OK(ReadOnlyReopen(track_options));
if (with_write) {
ASSERT_EQ(Get("foo"), "bar");
}
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
ASSERT_EQ(sttm.Size(), 0);
Use manifest to persist pre-allocated seqnos (#11995) Summary: ... and other fixes for crash test after https://github.com/facebook/rocksdb/issues/11922. * When pre-allocating sequence numbers for establishing a time history, record that last sequence number in the manifest so that it is (most likely) restored on recovery even if no user writes were made or were recovered (e.g. no WAL). * When pre-allocating sequence numbers for establishing a time history, only do this for actually new DBs. * Remove the feature that ensures non-zero sequence number on creating the first column family with preserve/preclude option after initial DB::Open. Until fixed in a way compatible with the crash test, this creates a gap where some data written with active preserve/preclude option won't have a known associated time. Together, these ensure we don't upset the crash test by manipulating sequence numbers after initial DB creation (esp when re-opening with different options). (The crash test expects that the seqno after re-open corresponds to a known point in time from previous crash test operation, matching an expected DB state.) Follow-up work: * Re-fill the gap to ensure all data written under preserve/preclude settings have a known time estimate. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11995 Test Plan: Added to unit test SeqnoTimeTablePropTest.PrePopulateInDB Verified fixes two crash test scenarios: ## 1st reproducer First apply ``` diff --git a/db_stress_tool/expected_state.cc b/db_stress_tool/expected_state.cc index b483e154c..ef63b8d6c 100644 --- a/db_stress_tool/expected_state.cc +++ b/db_stress_tool/expected_state.cc @@ -333,6 +333,7 @@ Status FileExpectedStateManager::SaveAtAndAfter(DB* db) { s = NewFileTraceWriter(Env::Default(), soptions, trace_file_path, &trace_writer); } + if (getenv("CRASH")) assert(false); if (s.ok()) { TraceOptions trace_opts; trace_opts.filter |= kTraceFilterGet; ``` Then ``` mkdir -p /dev/shm/rocksdb_test/rocksdb_crashtest_expected mkdir -p /dev/shm/rocksdb_test/rocksdb_crashtest_whitebox rm -rf /dev/shm/rocksdb_test/rocksdb_crashtest_*/* CRASH=1 ./db_stress --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --destroy_db_initially=1 --manual_wal_flush_one_in=1000000 --clear_column_family_one_in=0 --preserve_internal_time_seconds=36000 ./db_stress --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --destroy_db_initially=0 --manual_wal_flush_one_in=1000000 --clear_column_family_one_in=0 --preserve_internal_time_seconds=0 ``` Without the fix you get ``` ... 
DB path: [/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox] (Re-)verified 34 unique IDs Error restoring historical expected values: Corruption: DB is older than any restorable expected state ``` ## 2nd reproducer First apply ``` diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 62ddead7b..f2654980f 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -1126,6 +1126,7 @@ void StressTest::OperateDb(ThreadState* thread) { // OPERATION write TestPut(thread, write_opts, read_opts, rand_column_families, rand_keys, value); + if (getenv("CRASH")) assert(false); } else if (prob_op < del_bound) { assert(write_bound <= prob_op); // OPERATION delete ``` Then ``` rm -rf /dev/shm/rocksdb_test/rocksdb_crashtest_*/* CRASH=1 ./db_stress --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --destroy_db_initially=1 --manual_wal_flush_one_in=1000000 --clear_column_family_one_in=0 --disable_wal=1 --reopen=0 --preserve_internal_time_seconds=0 ./db_stress --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --destroy_db_initially=0 --manual_wal_flush_one_in=1000000 --clear_column_family_one_in=0 --disable_wal=1 --reopen=0 --preserve_internal_time_seconds=3600 ``` Without the fix you get ``` DB path: [/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox] (Re-)verified 34 unique IDs db_stress: db_stress_tool/expected_state.cc:380: virtual rocksdb::{anonymous}::ExpectedStateTraceRecordHandler::~ ExpectedStateTraceRecordHandler(): Assertion `IsDone()' failed. ``` Reviewed By: jowlyzhang Differential Revision: D50533346 Pulled By: pdillinger fbshipit-source-id: 1056be45c5b9e537c8c601b28c4b27431a782477
2023-10-23 16:20:59 +00:00
if (!with_write) {
ASSERT_EQ(db_->GetLatestSequenceNumber(), 0);
// And even if we re-open read-write, we do not get pre-population,
// because that's only for new DBs.
Reopen(track_options);
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
ASSERT_EQ(sttm.Size(), 0);
ASSERT_EQ(db_->GetLatestSequenceNumber(), 0);
}
Bootstrap, pre-populate seqno_to_time_mapping (#11922) Summary: This change has two primary goals (follow-up to https://github.com/facebook/rocksdb/issues/11917, https://github.com/facebook/rocksdb/issues/11920): * Ensure the DB seqno_to_time_mapping has entries that allow us to put a good time lower bound on any writes that happen after setting up preserve/preclude options (either in a new DB, new CF, SetOptions, etc.) and haven't yet aged out of that time window. This allows us to remove a bunch of work-arounds in tests. * For new DBs using preserve/preclude options, automatically reserve some sequence numbers and pre-map them to cover the time span back to the preserve/preclude cut-off time. In the future, this will allow us to import data from another DB by key, value, and write time by assigning an appropriate seqno in this DB for that write time. Note that the pre-population (historical mappings) does not happen if the original options at DB Open time do not have preserve/preclude, so it is recommended to create initial column families at that time with create_missing_column_families, to take advantage of this (future) feature. (Adding these historical mappings after DB Open would risk non-monotonic seqno_to_time_mapping, which is dubious if not dangerous.) Recommended follow-up: * Solve existing race conditions (not memory safety) where parallel operations like CreateColumnFamily or SetDBOptions could leave the wrong setting in effect. * Make SeqnoToTimeMapping more gracefully handle a possible case in which too many mappings are added for the time range of concern. It seems like there could be cases where data is massively excluded from the cold tier because of entries falling off the front of the mapping list (causing GetProximalSeqnoBeforeTime() to return 0). (More investigation needed.) No release note for the minor bug fix because this is still an experimental feature with limited usage. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11922 Test Plan: tests added / updated Reviewed By: jowlyzhang Differential Revision: D49956563 Pulled By: pdillinger fbshipit-source-id: 92beb918c3a298fae9ca8e509717b1067caa1519
2023-10-06 15:21:21 +00:00
}
// #### DB#5: Destroy and open with preserve/preclude option ####
DestroyAndReopen(track_options);
// Ensure pre-population
constexpr auto kPrePopPairs = SeqnoToTimeMapping::kMaxSeqnoTimePairsPerSST;
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
latest_seqno = db_->GetLatestSequenceNumber();
start_time = mock_clock_->NowSeconds();
ASSERT_EQ(sttm.Size(), kPrePopPairs);
// One non-zero sequence number per pre-populated pair (this could be
// revised if we want to use interpolation for better approximate time
// mappings with no guarantee of erring in just one direction).
ASSERT_EQ(latest_seqno, kPrePopPairs);
// Current time maps to last pre-allocated seqno
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time), latest_seqno);
// Oldest tracking time maps to first pre-allocated seqno
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time - kPreserveSecs), 1);
// In more detail, check that estimated seqnos (pre-allocated) are uniformly
// spread over the tracked time.
for (auto ratio : {0.0, 0.433, 0.678, 0.987, 1.0}) {
// Round up query time
uint64_t t = start_time - kPreserveSecs +
static_cast<uint64_t>(ratio * kPreserveSecs + 0.9999999);
// Round down estimated seqno
SequenceNumber s =
static_cast<SequenceNumber>(ratio * (latest_seqno - 1)) + 1;
// Match
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(t), s);
}
// Now check that writes can proceed normally (passing about 20% of preserve
// time)
for (int i = 0; i < 20; i++) {
ASSERT_OK(Put(Key(i), "value"));
dbfull()->TEST_WaitForPeriodicTaskRun([&] {
mock_clock_->MockSleepForSeconds(static_cast<int>(kPreserveSecs / 99));
});
}
ASSERT_OK(Flush());
// Can still see some pre-populated mappings, though some displaced
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
latest_seqno = db_->GetLatestSequenceNumber();
end_time = mock_clock_->NowSeconds();
ASSERT_EQ(sttm.Size(), kPrePopPairs);
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(end_time), latest_seqno);
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time - kPreserveSecs / 2),
kPrePopPairs / 2);
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time - kPreserveSecs),
kUnknownSeqnoBeforeAll);
// Make sure we don't hit issues with read-only DBs, which don't need
// the mapping in the DB state (though it wouldn't hurt anything)
ASSERT_OK(ReadOnlyReopen(track_options));
ASSERT_EQ(Get(Key(0)), "value");
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
ASSERT_EQ(sttm.Size(), 0);
// #### DB#6: Destroy and open+create an extra CF with preserve/preclude ####
// (default CF does not have the option)
Destroy(track_options);
ReopenWithColumnFamilies({"default", "one"},
List({base_options, track_options}));
// Ensure pre-population (not as exhaustive checking here)
sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
latest_seqno = db_->GetLatestSequenceNumber();
start_time = mock_clock_->NowSeconds();
ASSERT_EQ(sttm.Size(), kPrePopPairs);
// One non-zero sequence number per pre-populated pair (this could be
// revised if we want to use interpolation for better approximate time
// mappings with no guarantee of erring in just one direction).
ASSERT_EQ(latest_seqno, kPrePopPairs);
// Current time maps to last pre-allocated seqno
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time), latest_seqno);
// Oldest tracking time maps to first pre-allocated seqno
ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time - kPreserveSecs), 1);
Use manifest to persist pre-allocated seqnos (#11995) Summary: ... and other fixes for crash test after https://github.com/facebook/rocksdb/issues/11922. * When pre-allocating sequence numbers for establishing a time history, record that last sequence number in the manifest so that it is (most likely) restored on recovery even if no user writes were made or were recovered (e.g. no WAL). * When pre-allocating sequence numbers for establishing a time history, only do this for actually new DBs. * Remove the feature that ensures non-zero sequence number on creating the first column family with preserve/preclude option after initial DB::Open. Until fixed in a way compatible with the crash test, this creates a gap where some data written with active preserve/preclude option won't have a known associated time. Together, these ensure we don't upset the crash test by manipulating sequence numbers after initial DB creation (esp when re-opening with different options). (The crash test expects that the seqno after re-open corresponds to a known point in time from previous crash test operation, matching an expected DB state.) Follow-up work: * Re-fill the gap to ensure all data written under preserve/preclude settings have a known time estimate. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11995 Test Plan: Added to unit test SeqnoTimeTablePropTest.PrePopulateInDB Verified fixes two crash test scenarios: ## 1st reproducer First apply ``` diff --git a/db_stress_tool/expected_state.cc b/db_stress_tool/expected_state.cc index b483e154c..ef63b8d6c 100644 --- a/db_stress_tool/expected_state.cc +++ b/db_stress_tool/expected_state.cc @@ -333,6 +333,7 @@ Status FileExpectedStateManager::SaveAtAndAfter(DB* db) { s = NewFileTraceWriter(Env::Default(), soptions, trace_file_path, &trace_writer); } + if (getenv("CRASH")) assert(false); if (s.ok()) { TraceOptions trace_opts; trace_opts.filter |= kTraceFilterGet; ``` Then ``` mkdir -p /dev/shm/rocksdb_test/rocksdb_crashtest_expected mkdir -p /dev/shm/rocksdb_test/rocksdb_crashtest_whitebox rm -rf /dev/shm/rocksdb_test/rocksdb_crashtest_*/* CRASH=1 ./db_stress --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --destroy_db_initially=1 --manual_wal_flush_one_in=1000000 --clear_column_family_one_in=0 --preserve_internal_time_seconds=36000 ./db_stress --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --destroy_db_initially=0 --manual_wal_flush_one_in=1000000 --clear_column_family_one_in=0 --preserve_internal_time_seconds=0 ``` Without the fix you get ``` ... 
DB path: [/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox] (Re-)verified 34 unique IDs Error restoring historical expected values: Corruption: DB is older than any restorable expected state ``` ## 2nd reproducer First apply ``` diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 62ddead7b..f2654980f 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -1126,6 +1126,7 @@ void StressTest::OperateDb(ThreadState* thread) { // OPERATION write TestPut(thread, write_opts, read_opts, rand_column_families, rand_keys, value); + if (getenv("CRASH")) assert(false); } else if (prob_op < del_bound) { assert(write_bound <= prob_op); // OPERATION delete ``` Then ``` rm -rf /dev/shm/rocksdb_test/rocksdb_crashtest_*/* CRASH=1 ./db_stress --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --destroy_db_initially=1 --manual_wal_flush_one_in=1000000 --clear_column_family_one_in=0 --disable_wal=1 --reopen=0 --preserve_internal_time_seconds=0 ./db_stress --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --destroy_db_initially=0 --manual_wal_flush_one_in=1000000 --clear_column_family_one_in=0 --disable_wal=1 --reopen=0 --preserve_internal_time_seconds=3600 ``` Without the fix you get ``` DB path: [/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox] (Re-)verified 34 unique IDs db_stress: db_stress_tool/expected_state.cc:380: virtual rocksdb::{anonymous}::ExpectedStateTraceRecordHandler::~ ExpectedStateTraceRecordHandler(): Assertion `IsDone()' failed. ``` Reviewed By: jowlyzhang Differential Revision: D50533346 Pulled By: pdillinger fbshipit-source-id: 1056be45c5b9e537c8c601b28c4b27431a782477
2023-10-23 16:20:59 +00:00
// Even after no writes and DB re-open without tracking options, sequence
// numbers should not go backward into those that were pre-allocated.
// (Future work: persist the mapping)
ReopenWithColumnFamilies({"default", "one"},
List({base_options, base_options}));
ASSERT_EQ(latest_seqno, db_->GetLatestSequenceNumber());
Bootstrap, pre-populate seqno_to_time_mapping (#11922) Summary: This change has two primary goals (follow-up to https://github.com/facebook/rocksdb/issues/11917, https://github.com/facebook/rocksdb/issues/11920): * Ensure the DB seqno_to_time_mapping has entries that allow us to put a good time lower bound on any writes that happen after setting up preserve/preclude options (either in a new DB, new CF, SetOptions, etc.) and haven't yet aged out of that time window. This allows us to remove a bunch of work-arounds in tests. * For new DBs using preserve/preclude options, automatically reserve some sequence numbers and pre-map them to cover the time span back to the preserve/preclude cut-off time. In the future, this will allow us to import data from another DB by key, value, and write time by assigning an appropriate seqno in this DB for that write time. Note that the pre-population (historical mappings) does not happen if the original options at DB Open time do not have preserve/preclude, so it is recommended to create initial column families at that time with create_missing_column_families, to take advantage of this (future) feature. (Adding these historical mappings after DB Open would risk non-monotonic seqno_to_time_mapping, which is dubious if not dangerous.) Recommended follow-up: * Solve existing race conditions (not memory safety) where parallel operations like CreateColumnFamily or SetDBOptions could leave the wrong setting in effect. * Make SeqnoToTimeMapping more gracefully handle a possible case in which too many mappings are added for the time range of concern. It seems like there could be cases where data is massively excluded from the cold tier because of entries falling off the front of the mapping list (causing GetProximalSeqnoBeforeTime() to return 0). (More investigation needed.) No release note for the minor bug fix because this is still an experimental feature with limited usage. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11922 Test Plan: tests added / updated Reviewed By: jowlyzhang Differential Revision: D49956563 Pulled By: pdillinger fbshipit-source-id: 92beb918c3a298fae9ca8e509717b1067caa1519
2023-10-06 15:21:21 +00:00
Close();
}
TEST_F(SeqnoTimeTest, MappingAppend) {
SeqnoToTimeMapping test(/*max_time_duration=*/100, /*max_capacity=*/10);
// ignore seqno == 0, as it may mean the seqno is zeroed out
ASSERT_FALSE(test.Append(0, 9));
ASSERT_TRUE(test.Append(3, 10));
auto size = test.Size();
// normal add
ASSERT_TRUE(test.Append(10, 11));
size++;
ASSERT_EQ(size, test.Size());
// Append unsorted
ASSERT_FALSE(test.Append(8, 12));
ASSERT_EQ(size, test.Size());
Refactor, clean up, fixes, and more testing for SeqnoToTimeMapping (#11905) Summary: This change is before a planned DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping (bug fix with existing test work-arounds). **Intended follow-up** However, I found enough issues with SeqnoToTimeMapping to warrant this PR first, including very small fixes in DB implementation related to API contract of SeqnoToTimeMapping. Functional fixes / changes: * This fixes some mishandling of boundary cases. For example, if the user decides to stop writing to DB, the last written sequence number would perpetually have its write time updated to "now" and would always be ineligible for migration to cold tier. Part of the problem is that the SeqnoToTimeMapping would return a seqno known to have been written before (immediately or otherwise) the requested time, but compaction_job.cc would include that seqno in the preserve/exclude set. That is fixed (in part) by adding one in compaction_job.cc * That problem was worse because a whole range of seqnos could be updated perpetually with new times in SeqnoToTimeMapping::Append (if no writes to DB). That logic was apparently optimized for GetOldestApproximateTime (now GetProximalTimeBeforeSeqno), which is not used in production, to the detriment of GetOldestSequenceNum (now GetProximalSeqnoBeforeTime), which is used in production. (Perhaps plans changed during development?) This is fixed in Append to optimize for accuracy of GetProximalSeqnoBeforeTime. (Unit tests added and updated.) * Related: SeqnoToTimeMapping did not have a clear contract about the relationships between seqnos and times, just the idea of a rough correspondence. Now the class description makes it clear that the write time of each recorded seqno comes before or at the associated time, to support getting best results for GetProximalSeqnoBeforeTime. And this makes it easier to make clear the contract of each API function. 
* Update `DBImpl::RecordSeqnoToTimeMapping()` to follow this ordering in gathering samples. Some part of these changes has required an expanded test work-around for the problem (see intended follow-up above) that the DB does not immediately ensure recent seqnos are covered by its mapping. These work-arounds will be removed with that planned work. An apparent compaction bug is revealed in PrecludeLastLevelTest::RangeDelsCauseFileEndpointsToOverlap, so that test is disabled. Filed GitHub issue #11909 Cosmetic / code safety things (not exhaustive): * Fix some confusing names. * `seqno_time_mapping` was used inconsistently in places. Now just `seqno_to_time_mapping` to correspond to class name. * Rename confusing `GetOldestSequenceNum` -> `GetProximalSeqnoBeforeTime` and `GetOldestApproximateTime` -> `GetProximalTimeBeforeSeqno`. Part of the motivation is that our times and seqnos here have the same underlying type, so we want to be clear about which is expected where to avoid mixing. * Rename `kUnknownSeqnoTime` to `kUnknownTimeBeforeAll` because the value is a bad choice for unknown if we ever add ProximalAfterBlah functions. * Arithmetic on SeqnoTimePair doesn't make sense except for delta encoding, so use better names / APIs with that in mind. * (OMG) Don't allow direct comparison between SeqnoTimePair and SequenceNumber. (There is no checking that it isn't compared against time by accident.) * A field name essentially matching the containing class name is a confusing pattern (`seqno_time_mapping_`). * Wrap calls to confusing (but useful) upper_bound and lower_bound functions to have clearer names and more code reuse. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11905 Test Plan: GetOldestSequenceNum (now GetProximalSeqnoBeforeTime) and TruncateOldEntries were lacking unit tests, despite both being used in production (experimental feature). Added those and expanded others. 
Reviewed By: jowlyzhang Differential Revision: D49755592 Pulled By: pdillinger fbshipit-source-id: f72a3baac74d24b963c77e538bba89a7fc8dce51
2023-09-29 18:21:59 +00:00
// Append with the same seqno, newer time is rejected because that makes
// GetProximalSeqnoBeforeTime queries worse (see later test)
ASSERT_FALSE(test.Append(10, 12));
ASSERT_EQ(size, test.Size());
// older time will be ignored
ASSERT_FALSE(test.Append(10, 9));
ASSERT_EQ(size, test.Size());
// new seqno with old time will be ignored
ASSERT_FALSE(test.Append(12, 8));
ASSERT_EQ(size, test.Size());
Refactor, clean up, fixes, and more testing for SeqnoToTimeMapping (#11905) Summary: This change is before a planned DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping (bug fix with existing test work-arounds). **Intended follow-up** However, I found enough issues with SeqnoToTimeMapping to warrant this PR first, including very small fixes in DB implementation related to API contract of SeqnoToTimeMapping. Functional fixes / changes: * This fixes some mishandling of boundary cases. For example, if the user decides to stop writing to DB, the last written sequence number would perpetually have its write time updated to "now" and would always be ineligible for migration to cold tier. Part of the problem is that the SeqnoToTimeMapping would return a seqno known to have been written before (immediately or otherwise) the requested time, but compaction_job.cc would include that seqno in the preserve/exclude set. That is fixed (in part) by adding one in compaction_job.cc * That problem was worse because a whole range of seqnos could be updated perpetually with new times in SeqnoToTimeMapping::Append (if no writes to DB). That logic was apparently optimized for GetOldestApproximateTime (now GetProximalTimeBeforeSeqno), which is not used in production, to the detriment of GetOldestSequenceNum (now GetProximalSeqnoBeforeTime), which is used in production. (Perhaps plans changed during development?) This is fixed in Append to optimize for accuracy of GetProximalSeqnoBeforeTime. (Unit tests added and updated.) * Related: SeqnoToTimeMapping did not have a clear contract about the relationships between seqnos and times, just the idea of a rough correspondence. Now the class description makes it clear that the write time of each recorded seqno comes before or at the associated time, to support getting best results for GetProximalSeqnoBeforeTime. And this makes it easier to make clear the contract of each API function. 
* Update `DBImpl::RecordSeqnoToTimeMapping()` to follow this ordering in gathering samples. Some part of these changes has required an expanded test work-around for the problem (see intended follow-up above) that the DB does not immediately ensure recent seqnos are covered by its mapping. These work-arounds will be removed with that planned work. An apparent compaction bug is revealed in PrecludeLastLevelTest::RangeDelsCauseFileEndpointsToOverlap, so that test is disabled. Filed GitHub issue #11909 Cosmetic / code safety things (not exhaustive): * Fix some confusing names. * `seqno_time_mapping` was used inconsistently in places. Now just `seqno_to_time_mapping` to correspond to class name. * Rename confusing `GetOldestSequenceNum` -> `GetProximalSeqnoBeforeTime` and `GetOldestApproximateTime` -> `GetProximalTimeBeforeSeqno`. Part of the motivation is that our times and seqnos here have the same underlying type, so we want to be clear about which is expected where to avoid mixing. * Rename `kUnknownSeqnoTime` to `kUnknownTimeBeforeAll` because the value is a bad choice for unknown if we ever add ProximalAfterBlah functions. * Arithmetic on SeqnoTimePair doesn't make sense except for delta encoding, so use better names / APIs with that in mind. * (OMG) Don't allow direct comparison between SeqnoTimePair and SequenceNumber. (There is no checking that it isn't compared against time by accident.) * A field name essentially matching the containing class name is a confusing pattern (`seqno_time_mapping_`). * Wrap calls to confusing (but useful) upper_bound and lower_bound functions to have clearer names and more code reuse. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11905 Test Plan: GetOldestSequenceNum (now GetProximalSeqnoBeforeTime) and TruncateOldEntries were lacking unit tests, despite both being used in production (experimental feature). Added those and expanded others. 
Reviewed By: jowlyzhang Differential Revision: D49755592 Pulled By: pdillinger fbshipit-source-id: f72a3baac74d24b963c77e538bba89a7fc8dce51
2023-09-29 18:21:59 +00:00
// new seqno with same time is accepted by replacing last entry
// (improves GetProximalSeqnoBeforeTime queries without blowing up size)
ASSERT_TRUE(test.Append(12, 11));
ASSERT_EQ(size, test.Size());
}
Refactor, clean up, fixes, and more testing for SeqnoToTimeMapping (#11905) Summary: This change is before a planned DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping (bug fix with existing test work-arounds). **Intended follow-up** However, I found enough issues with SeqnoToTimeMapping to warrant this PR first, including very small fixes in DB implementation related to API contract of SeqnoToTimeMapping. Functional fixes / changes: * This fixes some mishandling of boundary cases. For example, if the user decides to stop writing to DB, the last written sequence number would perpetually have its write time updated to "now" and would always be ineligible for migration to cold tier. Part of the problem is that the SeqnoToTimeMapping would return a seqno known to have been written before (immediately or otherwise) the requested time, but compaction_job.cc would include that seqno in the preserve/exclude set. That is fixed (in part) by adding one in compaction_job.cc * That problem was worse because a whole range of seqnos could be updated perpetually with new times in SeqnoToTimeMapping::Append (if no writes to DB). That logic was apparently optimized for GetOldestApproximateTime (now GetProximalTimeBeforeSeqno), which is not used in production, to the detriment of GetOldestSequenceNum (now GetProximalSeqnoBeforeTime), which is used in production. (Perhaps plans changed during development?) This is fixed in Append to optimize for accuracy of GetProximalSeqnoBeforeTime. (Unit tests added and updated.) * Related: SeqnoToTimeMapping did not have a clear contract about the relationships between seqnos and times, just the idea of a rough correspondence. Now the class description makes it clear that the write time of each recorded seqno comes before or at the associated time, to support getting best results for GetProximalSeqnoBeforeTime. And this makes it easier to make clear the contract of each API function. 
* Update `DBImpl::RecordSeqnoToTimeMapping()` to follow this ordering in gathering samples. Some part of these changes has required an expanded test work-around for the problem (see intended follow-up above) that the DB does not immediately ensure recent seqnos are covered by its mapping. These work-arounds will be removed with that planned work. An apparent compaction bug is revealed in PrecludeLastLevelTest::RangeDelsCauseFileEndpointsToOverlap, so that test is disabled. Filed GitHub issue #11909 Cosmetic / code safety things (not exhaustive): * Fix some confusing names. * `seqno_time_mapping` was used inconsistently in places. Now just `seqno_to_time_mapping` to correspond to class name. * Rename confusing `GetOldestSequenceNum` -> `GetProximalSeqnoBeforeTime` and `GetOldestApproximateTime` -> `GetProximalTimeBeforeSeqno`. Part of the motivation is that our times and seqnos here have the same underlying type, so we want to be clear about which is expected where to avoid mixing. * Rename `kUnknownSeqnoTime` to `kUnknownTimeBeforeAll` because the value is a bad choice for unknown if we ever add ProximalAfterBlah functions. * Arithmetic on SeqnoTimePair doesn't make sense except for delta encoding, so use better names / APIs with that in mind. * (OMG) Don't allow direct comparison between SeqnoTimePair and SequenceNumber. (There is no checking that it isn't compared against time by accident.) * A field name essentially matching the containing class name is a confusing pattern (`seqno_time_mapping_`). * Wrap calls to confusing (but useful) upper_bound and lower_bound functions to have clearer names and more code reuse. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11905 Test Plan: GetOldestSequenceNum (now GetProximalSeqnoBeforeTime) and TruncateOldEntries were lacking unit tests, despite both being used in production (experimental feature). Added those and expanded others. 
Reviewed By: jowlyzhang Differential Revision: D49755592 Pulled By: pdillinger fbshipit-source-id: f72a3baac74d24b963c77e538bba89a7fc8dce51
2023-09-29 18:21:59 +00:00
TEST_F(SeqnoTimeTest, ProximalFunctions) {
SeqnoToTimeMapping test(/*max_time_duration=*/100, /*max_capacity=*/10);
Refactor, clean up, fixes, and more testing for SeqnoToTimeMapping (#11905) Summary: This change is before a planned DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping (bug fix with existing test work-arounds). **Intended follow-up** However, I found enough issues with SeqnoToTimeMapping to warrant this PR first, including very small fixes in DB implementation related to API contract of SeqnoToTimeMapping. Functional fixes / changes: * This fixes some mishandling of boundary cases. For example, if the user decides to stop writing to DB, the last written sequence number would perpetually have its write time updated to "now" and would always be ineligible for migration to cold tier. Part of the problem is that the SeqnoToTimeMapping would return a seqno known to have been written before (immediately or otherwise) the requested time, but compaction_job.cc would include that seqno in the preserve/exclude set. That is fixed (in part) by adding one in compaction_job.cc * That problem was worse because a whole range of seqnos could be updated perpetually with new times in SeqnoToTimeMapping::Append (if no writes to DB). That logic was apparently optimized for GetOldestApproximateTime (now GetProximalTimeBeforeSeqno), which is not used in production, to the detriment of GetOldestSequenceNum (now GetProximalSeqnoBeforeTime), which is used in production. (Perhaps plans changed during development?) This is fixed in Append to optimize for accuracy of GetProximalSeqnoBeforeTime. (Unit tests added and updated.) * Related: SeqnoToTimeMapping did not have a clear contract about the relationships between seqnos and times, just the idea of a rough correspondence. Now the class description makes it clear that the write time of each recorded seqno comes before or at the associated time, to support getting best results for GetProximalSeqnoBeforeTime. And this makes it easier to make clear the contract of each API function. 
* Update `DBImpl::RecordSeqnoToTimeMapping()` to follow this ordering in gathering samples. Some part of these changes has required an expanded test work-around for the problem (see intended follow-up above) that the DB does not immediately ensure recent seqnos are covered by its mapping. These work-arounds will be removed with that planned work. An apparent compaction bug is revealed in PrecludeLastLevelTest::RangeDelsCauseFileEndpointsToOverlap, so that test is disabled. Filed GitHub issue #11909 Cosmetic / code safety things (not exhaustive): * Fix some confusing names. * `seqno_time_mapping` was used inconsistently in places. Now just `seqno_to_time_mapping` to correspond to class name. * Rename confusing `GetOldestSequenceNum` -> `GetProximalSeqnoBeforeTime` and `GetOldestApproximateTime` -> `GetProximalTimeBeforeSeqno`. Part of the motivation is that our times and seqnos here have the same underlying type, so we want to be clear about which is expected where to avoid mixing. * Rename `kUnknownSeqnoTime` to `kUnknownTimeBeforeAll` because the value is a bad choice for unknown if we ever add ProximalAfterBlah functions. * Arithmetic on SeqnoTimePair doesn't make sense except for delta encoding, so use better names / APIs with that in mind. * (OMG) Don't allow direct comparison between SeqnoTimePair and SequenceNumber. (There is no checking that it isn't compared against time by accident.) * A field name essentially matching the containing class name is a confusing pattern (`seqno_time_mapping_`). * Wrap calls to confusing (but useful) upper_bound and lower_bound functions to have clearer names and more code reuse. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11905 Test Plan: GetOldestSequenceNum (now GetProximalSeqnoBeforeTime) and TruncateOldEntries were lacking unit tests, despite both being used in production (experimental feature). Added those and expanded others. 
Reviewed By: jowlyzhang Differential Revision: D49755592 Pulled By: pdillinger fbshipit-source-id: f72a3baac74d24b963c77e538bba89a7fc8dce51
2023-09-29 18:21:59 +00:00
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(1), kUnknownTimeBeforeAll);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(1000000000000U),
kUnknownTimeBeforeAll);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(1), kUnknownSeqnoBeforeAll);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(1000000000000U),
kUnknownSeqnoBeforeAll);
// (Taken from example in SeqnoToTimeMapping class comment)
// Time 500 is after seqno 10 and before seqno 11
EXPECT_TRUE(test.Append(10, 500));
// Seqno too early
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(9), kUnknownTimeBeforeAll);
// We only know that 500 is after 10
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(10), kUnknownTimeBeforeAll);
// Found
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(11), 500U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(1000000000000U), 500U);
// Time too early
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(499), kUnknownSeqnoBeforeAll);
// Found
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(500), 10U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(501), 10U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(1000000000000U), 10U);
// More samples
EXPECT_TRUE(test.Append(20, 600));
EXPECT_TRUE(test.Append(30, 700));
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(10), kUnknownTimeBeforeAll);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(11), 500U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(20), 500U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(21), 600U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(30), 600U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(31), 700U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(1000000000000U), 700U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(499), kUnknownSeqnoBeforeAll);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(500), 10U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(501), 10U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(599), 10U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(600), 20U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(601), 20U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(699), 20U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(700), 30U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(701), 30U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(1000000000000U), 30U);
// Redundant sample ignored
EXPECT_EQ(test.Size(), 3U);
EXPECT_FALSE(test.Append(30, 700));
EXPECT_EQ(test.Size(), 3U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(30), 600U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(31), 700U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(699), 20U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(700), 30U);
// Later sample with same seqno is ignored, to provide best results
// for GetProximalSeqnoBeforeTime function while saving entries
// in SeqnoToTimeMapping.
EXPECT_FALSE(test.Append(30, 800));
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(30), 600U);
// Could return 800, but saving space in SeqnoToTimeMapping instead.
// Can reconsider if/when GetProximalTimeBeforeSeqno is used in
// production.
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(31), 700U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(699), 20U);
// If the existing {30, 700} entry were replaced with {30, 800}, this
// would return seqno 20 instead of 30, which would preclude more than
// necessary for "preclude_last_level_data_seconds" feature.
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(700), 30U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(800), 30U);
// Still OK
EXPECT_TRUE(test.Append(40, 900));
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(30), 600U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(41), 900U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(899), 30U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(900), 40U);
// Burst of writes during a short time creates an opportunity
// for better results from GetProximalSeqnoBeforeTime(), at the
// expense of GetProximalTimeBeforeSeqno().
EXPECT_TRUE(test.Append(50, 900));
// These are subject to later revision depending on priorities
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(49), 700U);
EXPECT_EQ(test.GetProximalTimeBeforeSeqno(51), 900U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(899), 30U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(900), 50U);
}
Bootstrap, pre-populate seqno_to_time_mapping (#11922) Summary: This change has two primary goals (follow-up to https://github.com/facebook/rocksdb/issues/11917, https://github.com/facebook/rocksdb/issues/11920): * Ensure the DB seqno_to_time_mapping has entries that allow us to put a good time lower bound on any writes that happen after setting up preserve/preclude options (either in a new DB, new CF, SetOptions, etc.) and haven't yet aged out of that time window. This allows us to remove a bunch of work-arounds in tests. * For new DBs using preserve/preclude options, automatically reserve some sequence numbers and pre-map them to cover the time span back to the preserve/preclude cut-off time. In the future, this will allow us to import data from another DB by key, value, and write time by assigning an appropriate seqno in this DB for that write time. Note that the pre-population (historical mappings) does not happen if the original options at DB Open time do not have preserve/preclude, so it is recommended to create initial column families at that time with create_missing_column_families, to take advantage of this (future) feature. (Adding these historical mappings after DB Open would risk non-monotonic seqno_to_time_mapping, which is dubious if not dangerous.) Recommended follow-up: * Solve existing race conditions (not memory safety) where parallel operations like CreateColumnFamily or SetDBOptions could leave the wrong setting in effect. * Make SeqnoToTimeMapping more gracefully handle a possible case in which too many mappings are added for the time range of concern. It seems like there could be cases where data is massively excluded from the cold tier because of entries falling off the front of the mapping list (causing GetProximalSeqnoBeforeTime() to return 0). (More investigation needed.) No release note for the minor bug fix because this is still an experimental feature with limited usage. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11922 Test Plan: tests added / updated Reviewed By: jowlyzhang Differential Revision: D49956563 Pulled By: pdillinger fbshipit-source-id: 92beb918c3a298fae9ca8e509717b1067caa1519
2023-10-06 15:21:21 +00:00
// Checks SeqnoToTimeMapping::PrePopulate() on three inputs: the minimal
// two-seqno range, a small range, and an excessively large range.
TEST_F(SeqnoTimeTest, PrePopulate) {
  SeqnoToTimeMapping mapping(/*max_time_duration=*/100, /*max_capacity=*/10);
  EXPECT_EQ(mapping.Size(), 0U);

  // --- Minimal range: behaves like two Append() calls ---
  mapping.PrePopulate(10, 11, 500, 600);

  EXPECT_EQ(mapping.GetProximalTimeBeforeSeqno(10), kUnknownTimeBeforeAll);
  EXPECT_EQ(mapping.GetProximalTimeBeforeSeqno(11), 500U);
  EXPECT_EQ(mapping.GetProximalTimeBeforeSeqno(12), 600U);

  mapping.Clear();

  // Both remaining cases pre-populate the time span
  // [kTimeIncrement, 2 * kTimeIncrement].
  constexpr uint64_t kTimeIncrement = 1234567;

  // --- Small range: seqnos 1 through 12 ---
  constexpr uint64_t kSmallRangeLastSeqno = 12;
  // With 1 and 12 as the pre-populated end points, the duration is split
  // into 11 equal(-ish) spans.
  constexpr uint64_t kSmallRangeSpans = 11;
  mapping.PrePopulate(1, kSmallRangeLastSeqno, kTimeIncrement,
                      kTimeIncrement * 2);

  for (uint64_t seqno = 0; seqno <= kSmallRangeLastSeqno; ++seqno) {
    const uint64_t query_time =
        kTimeIncrement + (seqno * kTimeIncrement) / kSmallRangeSpans - 1;
    EXPECT_EQ(mapping.GetProximalSeqnoBeforeTime(query_time), seqno);
  }

  mapping.Clear();

  // --- Excessively large range ---
  // (In the future we might want to interpolate estimated times for seqnos
  // that fall between entries.)
  constexpr SequenceNumber kLargeRangeLastSeqno = 34567;
  mapping.PrePopulate(1, kLargeRangeLastSeqno, kTimeIncrement,
                      kTimeIncrement * 2);

  for (const double ratio : {0.0, 0.433, 0.678, 0.987, 1.0}) {
    // Query time: the given fraction of the way through the span, rounded up
    const uint64_t query_time =
        kTimeIncrement +
        static_cast<uint64_t>(ratio * kTimeIncrement + 0.9999999);
    // Expected seqno: the same fraction of the seqno range, rounded down
    const SequenceNumber expected_seqno =
        static_cast<SequenceNumber>(ratio * (kLargeRangeLastSeqno - 1)) + 1;
    // TODO: this is an exact match for now, but approximation bounds might be
    // needed in the future to account for limited samples.
    EXPECT_EQ(mapping.GetProximalSeqnoBeforeTime(query_time), expected_seqno);
  }
}
Refactor, clean up, fixes, and more testing for SeqnoToTimeMapping (#11905) Summary: This change is before a planned DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping (bug fix with existing test work-arounds). **Intended follow-up** However, I found enough issues with SeqnoToTimeMapping to warrant this PR first, including very small fixes in DB implementation related to API contract of SeqnoToTimeMapping. Functional fixes / changes: * This fixes some mishandling of boundary cases. For example, if the user decides to stop writing to DB, the last written sequence number would perpetually have its write time updated to "now" and would always be ineligible for migration to cold tier. Part of the problem is that the SeqnoToTimeMapping would return a seqno known to have been written before (immediately or otherwise) the requested time, but compaction_job.cc would include that seqno in the preserve/exclude set. That is fixed (in part) by adding one in compaction_job.cc * That problem was worse because a whole range of seqnos could be updated perpetually with new times in SeqnoToTimeMapping::Append (if no writes to DB). That logic was apparently optimized for GetOldestApproximateTime (now GetProximalTimeBeforeSeqno), which is not used in production, to the detriment of GetOldestSequenceNum (now GetProximalSeqnoBeforeTime), which is used in production. (Perhaps plans changed during development?) This is fixed in Append to optimize for accuracy of GetProximalSeqnoBeforeTime. (Unit tests added and updated.) * Related: SeqnoToTimeMapping did not have a clear contract about the relationships between seqnos and times, just the idea of a rough correspondence. Now the class description makes it clear that the write time of each recorded seqno comes before or at the associated time, to support getting best results for GetProximalSeqnoBeforeTime. And this makes it easier to make clear the contract of each API function. 
* Update `DBImpl::RecordSeqnoToTimeMapping()` to follow this ordering in gathering samples. Some part of these changes has required an expanded test work-around for the problem (see intended follow-up above) that the DB does not immediately ensure recent seqnos are covered by its mapping. These work-arounds will be removed with that planned work. An apparent compaction bug is revealed in PrecludeLastLevelTest::RangeDelsCauseFileEndpointsToOverlap, so that test is disabled. Filed GitHub issue #11909 Cosmetic / code safety things (not exhaustive): * Fix some confusing names. * `seqno_time_mapping` was used inconsistently in places. Now just `seqno_to_time_mapping` to correspond to class name. * Rename confusing `GetOldestSequenceNum` -> `GetProximalSeqnoBeforeTime` and `GetOldestApproximateTime` -> `GetProximalTimeBeforeSeqno`. Part of the motivation is that our times and seqnos here have the same underlying type, so we want to be clear about which is expected where to avoid mixing. * Rename `kUnknownSeqnoTime` to `kUnknownTimeBeforeAll` because the value is a bad choice for unknown if we ever add ProximalAfterBlah functions. * Arithmetic on SeqnoTimePair doesn't make sense except for delta encoding, so use better names / APIs with that in mind. * (OMG) Don't allow direct comparison between SeqnoTimePair and SequenceNumber. (There is no checking that it isn't compared against time by accident.) * A field name essentially matching the containing class name is a confusing pattern (`seqno_time_mapping_`). * Wrap calls to confusing (but useful) upper_bound and lower_bound functions to have clearer names and more code reuse. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11905 Test Plan: GetOldestSequenceNum (now GetProximalSeqnoBeforeTime) and TruncateOldEntries were lacking unit tests, despite both being used in production (experimental feature). Added those and expanded others. 
Reviewed By: jowlyzhang Differential Revision: D49755592 Pulled By: pdillinger fbshipit-source-id: f72a3baac74d24b963c77e538bba89a7fc8dce51
2023-09-29 18:21:59 +00:00
TEST_F(SeqnoTimeTest, TruncateOldEntries) {
constexpr uint64_t kMaxTimeDuration = 42;
SeqnoToTimeMapping test(kMaxTimeDuration, /*max_capacity=*/10);
EXPECT_EQ(test.Size(), 0U);
// Safe on empty mapping
test.TruncateOldEntries(500);
EXPECT_EQ(test.Size(), 0U);
// (Taken from example in SeqnoToTimeMapping class comment)
// Time 500 is after seqno 10 and before seqno 11
EXPECT_TRUE(test.Append(10, 500));
EXPECT_TRUE(test.Append(20, 600));
EXPECT_TRUE(test.Append(30, 700));
EXPECT_TRUE(test.Append(40, 800));
EXPECT_TRUE(test.Append(50, 900));
EXPECT_EQ(test.Size(), 5U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(500), 10U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(599), 10U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(600), 20U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(699), 20U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(700), 30U);
// etc.
// Must keep first entry
test.TruncateOldEntries(500 + kMaxTimeDuration);
EXPECT_EQ(test.Size(), 5U);
test.TruncateOldEntries(599 + kMaxTimeDuration);
EXPECT_EQ(test.Size(), 5U);
// Purges first entry
test.TruncateOldEntries(600 + kMaxTimeDuration);
EXPECT_EQ(test.Size(), 4U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(500), kUnknownSeqnoBeforeAll);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(599), kUnknownSeqnoBeforeAll);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(600), 20U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(699), 20U);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(700), 30U);
// No effect
test.TruncateOldEntries(600 + kMaxTimeDuration);
EXPECT_EQ(test.Size(), 4U);
test.TruncateOldEntries(699 + kMaxTimeDuration);
EXPECT_EQ(test.Size(), 4U);
Refactor, clean up, fixes, and more testing for SeqnoToTimeMapping (#11905) Summary: This change is before a planned DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping (bug fix with existing test work-arounds). **Intended follow-up** However, I found enough issues with SeqnoToTimeMapping to warrant this PR first, including very small fixes in DB implementation related to API contract of SeqnoToTimeMapping. Functional fixes / changes: * This fixes some mishandling of boundary cases. For example, if the user decides to stop writing to DB, the last written sequence number would perpetually have its write time updated to "now" and would always be ineligible for migration to cold tier. Part of the problem is that the SeqnoToTimeMapping would return a seqno known to have been written before (immediately or otherwise) the requested time, but compaction_job.cc would include that seqno in the preserve/exclude set. That is fixed (in part) by adding one in compaction_job.cc * That problem was worse because a whole range of seqnos could be updated perpetually with new times in SeqnoToTimeMapping::Append (if no writes to DB). That logic was apparently optimized for GetOldestApproximateTime (now GetProximalTimeBeforeSeqno), which is not used in production, to the detriment of GetOldestSequenceNum (now GetProximalSeqnoBeforeTime), which is used in production. (Perhaps plans changed during development?) This is fixed in Append to optimize for accuracy of GetProximalSeqnoBeforeTime. (Unit tests added and updated.) * Related: SeqnoToTimeMapping did not have a clear contract about the relationships between seqnos and times, just the idea of a rough correspondence. Now the class description makes it clear that the write time of each recorded seqno comes before or at the associated time, to support getting best results for GetProximalSeqnoBeforeTime. And this makes it easier to make clear the contract of each API function. 
* Update `DBImpl::RecordSeqnoToTimeMapping()` to follow this ordering in gathering samples. Some part of these changes has required an expanded test work-around for the problem (see intended follow-up above) that the DB does not immediately ensure recent seqnos are covered by its mapping. These work-arounds will be removed with that planned work. An apparent compaction bug is revealed in PrecludeLastLevelTest::RangeDelsCauseFileEndpointsToOverlap, so that test is disabled. Filed GitHub issue #11909 Cosmetic / code safety things (not exhaustive): * Fix some confusing names. * `seqno_time_mapping` was used inconsistently in places. Now just `seqno_to_time_mapping` to correspond to class name. * Rename confusing `GetOldestSequenceNum` -> `GetProximalSeqnoBeforeTime` and `GetOldestApproximateTime` -> `GetProximalTimeBeforeSeqno`. Part of the motivation is that our times and seqnos here have the same underlying type, so we want to be clear about which is expected where to avoid mixing. * Rename `kUnknownSeqnoTime` to `kUnknownTimeBeforeAll` because the value is a bad choice for unknown if we ever add ProximalAfterBlah functions. * Arithmetic on SeqnoTimePair doesn't make sense except for delta encoding, so use better names / APIs with that in mind. * (OMG) Don't allow direct comparison between SeqnoTimePair and SequenceNumber. (There is no checking that it isn't compared against time by accident.) * A field name essentially matching the containing class name is a confusing pattern (`seqno_time_mapping_`). * Wrap calls to confusing (but useful) upper_bound and lower_bound functions to have clearer names and more code reuse. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11905 Test Plan: GetOldestSequenceNum (now GetProximalSeqnoBeforeTime) and TruncateOldEntries were lacking unit tests, despite both being used in production (experimental feature). Added those and expanded others. 
Reviewed By: jowlyzhang Differential Revision: D49755592 Pulled By: pdillinger fbshipit-source-id: f72a3baac74d24b963c77e538bba89a7fc8dce51
2023-09-29 18:21:59 +00:00
// Purges next two
test.TruncateOldEntries(899 + kMaxTimeDuration);
EXPECT_EQ(test.Size(), 2U);
Refactor, clean up, fixes, and more testing for SeqnoToTimeMapping (#11905) Summary: This change is before a planned DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping (bug fix with existing test work-arounds). **Intended follow-up** However, I found enough issues with SeqnoToTimeMapping to warrant this PR first, including very small fixes in DB implementation related to API contract of SeqnoToTimeMapping. Functional fixes / changes: * This fixes some mishandling of boundary cases. For example, if the user decides to stop writing to DB, the last written sequence number would perpetually have its write time updated to "now" and would always be ineligible for migration to cold tier. Part of the problem is that the SeqnoToTimeMapping would return a seqno known to have been written before (immediately or otherwise) the requested time, but compaction_job.cc would include that seqno in the preserve/exclude set. That is fixed (in part) by adding one in compaction_job.cc * That problem was worse because a whole range of seqnos could be updated perpetually with new times in SeqnoToTimeMapping::Append (if no writes to DB). That logic was apparently optimized for GetOldestApproximateTime (now GetProximalTimeBeforeSeqno), which is not used in production, to the detriment of GetOldestSequenceNum (now GetProximalSeqnoBeforeTime), which is used in production. (Perhaps plans changed during development?) This is fixed in Append to optimize for accuracy of GetProximalSeqnoBeforeTime. (Unit tests added and updated.) * Related: SeqnoToTimeMapping did not have a clear contract about the relationships between seqnos and times, just the idea of a rough correspondence. Now the class description makes it clear that the write time of each recorded seqno comes before or at the associated time, to support getting best results for GetProximalSeqnoBeforeTime. And this makes it easier to make clear the contract of each API function. 
* Update `DBImpl::RecordSeqnoToTimeMapping()` to follow this ordering in gathering samples. Some part of these changes has required an expanded test work-around for the problem (see intended follow-up above) that the DB does not immediately ensure recent seqnos are covered by its mapping. These work-arounds will be removed with that planned work. An apparent compaction bug is revealed in PrecludeLastLevelTest::RangeDelsCauseFileEndpointsToOverlap, so that test is disabled. Filed GitHub issue #11909 Cosmetic / code safety things (not exhaustive): * Fix some confusing names. * `seqno_time_mapping` was used inconsistently in places. Now just `seqno_to_time_mapping` to correspond to class name. * Rename confusing `GetOldestSequenceNum` -> `GetProximalSeqnoBeforeTime` and `GetOldestApproximateTime` -> `GetProximalTimeBeforeSeqno`. Part of the motivation is that our times and seqnos here have the same underlying type, so we want to be clear about which is expected where to avoid mixing. * Rename `kUnknownSeqnoTime` to `kUnknownTimeBeforeAll` because the value is a bad choice for unknown if we ever add ProximalAfterBlah functions. * Arithmetic on SeqnoTimePair doesn't make sense except for delta encoding, so use better names / APIs with that in mind. * (OMG) Don't allow direct comparison between SeqnoTimePair and SequenceNumber. (There is no checking that it isn't compared against time by accident.) * A field name essentially matching the containing class name is a confusing pattern (`seqno_time_mapping_`). * Wrap calls to confusing (but useful) upper_bound and lower_bound functions to have clearer names and more code reuse. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11905 Test Plan: GetOldestSequenceNum (now GetProximalSeqnoBeforeTime) and TruncateOldEntries were lacking unit tests, despite both being used in production (experimental feature). Added those and expanded others. 
Reviewed By: jowlyzhang Differential Revision: D49755592 Pulled By: pdillinger fbshipit-source-id: f72a3baac74d24b963c77e538bba89a7fc8dce51
2023-09-29 18:21:59 +00:00
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(799), kUnknownSeqnoBeforeAll);
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(899), 40U);
Refactor, clean up, fixes, and more testing for SeqnoToTimeMapping (#11905) Summary: This change is before a planned DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping (bug fix with existing test work-arounds). **Intended follow-up** However, I found enough issues with SeqnoToTimeMapping to warrant this PR first, including very small fixes in DB implementation related to API contract of SeqnoToTimeMapping. Functional fixes / changes: * This fixes some mishandling of boundary cases. For example, if the user decides to stop writing to DB, the last written sequence number would perpetually have its write time updated to "now" and would always be ineligible for migration to cold tier. Part of the problem is that the SeqnoToTimeMapping would return a seqno known to have been written before (immediately or otherwise) the requested time, but compaction_job.cc would include that seqno in the preserve/exclude set. That is fixed (in part) by adding one in compaction_job.cc * That problem was worse because a whole range of seqnos could be updated perpetually with new times in SeqnoToTimeMapping::Append (if no writes to DB). That logic was apparently optimized for GetOldestApproximateTime (now GetProximalTimeBeforeSeqno), which is not used in production, to the detriment of GetOldestSequenceNum (now GetProximalSeqnoBeforeTime), which is used in production. (Perhaps plans changed during development?) This is fixed in Append to optimize for accuracy of GetProximalSeqnoBeforeTime. (Unit tests added and updated.) * Related: SeqnoToTimeMapping did not have a clear contract about the relationships between seqnos and times, just the idea of a rough correspondence. Now the class description makes it clear that the write time of each recorded seqno comes before or at the associated time, to support getting best results for GetProximalSeqnoBeforeTime. And this makes it easier to make clear the contract of each API function. 
* Update `DBImpl::RecordSeqnoToTimeMapping()` to follow this ordering in gathering samples. Some part of these changes has required an expanded test work-around for the problem (see intended follow-up above) that the DB does not immediately ensure recent seqnos are covered by its mapping. These work-arounds will be removed with that planned work. An apparent compaction bug is revealed in PrecludeLastLevelTest::RangeDelsCauseFileEndpointsToOverlap, so that test is disabled. Filed GitHub issue #11909 Cosmetic / code safety things (not exhaustive): * Fix some confusing names. * `seqno_time_mapping` was used inconsistently in places. Now just `seqno_to_time_mapping` to correspond to class name. * Rename confusing `GetOldestSequenceNum` -> `GetProximalSeqnoBeforeTime` and `GetOldestApproximateTime` -> `GetProximalTimeBeforeSeqno`. Part of the motivation is that our times and seqnos here have the same underlying type, so we want to be clear about which is expected where to avoid mixing. * Rename `kUnknownSeqnoTime` to `kUnknownTimeBeforeAll` because the value is a bad choice for unknown if we ever add ProximalAfterBlah functions. * Arithmetic on SeqnoTimePair doesn't make sense except for delta encoding, so use better names / APIs with that in mind. * (OMG) Don't allow direct comparison between SeqnoTimePair and SequenceNumber. (There is no checking that it isn't compared against time by accident.) * A field name essentially matching the containing class name is a confusing pattern (`seqno_time_mapping_`). * Wrap calls to confusing (but useful) upper_bound and lower_bound functions to have clearer names and more code reuse. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11905 Test Plan: GetOldestSequenceNum (now GetProximalSeqnoBeforeTime) and TruncateOldEntries were lacking unit tests, despite both being used in production (experimental feature). Added those and expanded others. 
Reviewed By: jowlyzhang Differential Revision: D49755592 Pulled By: pdillinger fbshipit-source-id: f72a3baac74d24b963c77e538bba89a7fc8dce51
2023-09-29 18:21:59 +00:00
// Always keep last entry, to have a non-trivial seqno bound
test.TruncateOldEntries(10000000);
EXPECT_EQ(test.Size(), 1U);
Refactor, clean up, fixes, and more testing for SeqnoToTimeMapping (#11905) Summary: This change is before a planned DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping (bug fix with existing test work-arounds). **Intended follow-up** However, I found enough issues with SeqnoToTimeMapping to warrant this PR first, including very small fixes in DB implementation related to API contract of SeqnoToTimeMapping. Functional fixes / changes: * This fixes some mishandling of boundary cases. For example, if the user decides to stop writing to DB, the last written sequence number would perpetually have its write time updated to "now" and would always be ineligible for migration to cold tier. Part of the problem is that the SeqnoToTimeMapping would return a seqno known to have been written before (immediately or otherwise) the requested time, but compaction_job.cc would include that seqno in the preserve/exclude set. That is fixed (in part) by adding one in compaction_job.cc * That problem was worse because a whole range of seqnos could be updated perpetually with new times in SeqnoToTimeMapping::Append (if no writes to DB). That logic was apparently optimized for GetOldestApproximateTime (now GetProximalTimeBeforeSeqno), which is not used in production, to the detriment of GetOldestSequenceNum (now GetProximalSeqnoBeforeTime), which is used in production. (Perhaps plans changed during development?) This is fixed in Append to optimize for accuracy of GetProximalSeqnoBeforeTime. (Unit tests added and updated.) * Related: SeqnoToTimeMapping did not have a clear contract about the relationships between seqnos and times, just the idea of a rough correspondence. Now the class description makes it clear that the write time of each recorded seqno comes before or at the associated time, to support getting best results for GetProximalSeqnoBeforeTime. And this makes it easier to make clear the contract of each API function. 
* Update `DBImpl::RecordSeqnoToTimeMapping()` to follow this ordering in gathering samples. Some part of these changes has required an expanded test work-around for the problem (see intended follow-up above) that the DB does not immediately ensure recent seqnos are covered by its mapping. These work-arounds will be removed with that planned work. An apparent compaction bug is revealed in PrecludeLastLevelTest::RangeDelsCauseFileEndpointsToOverlap, so that test is disabled. Filed GitHub issue #11909 Cosmetic / code safety things (not exhaustive): * Fix some confusing names. * `seqno_time_mapping` was used inconsistently in places. Now just `seqno_to_time_mapping` to correspond to class name. * Rename confusing `GetOldestSequenceNum` -> `GetProximalSeqnoBeforeTime` and `GetOldestApproximateTime` -> `GetProximalTimeBeforeSeqno`. Part of the motivation is that our times and seqnos here have the same underlying type, so we want to be clear about which is expected where to avoid mixing. * Rename `kUnknownSeqnoTime` to `kUnknownTimeBeforeAll` because the value is a bad choice for unknown if we ever add ProximalAfterBlah functions. * Arithmetic on SeqnoTimePair doesn't make sense except for delta encoding, so use better names / APIs with that in mind. * (OMG) Don't allow direct comparison between SeqnoTimePair and SequenceNumber. (There is no checking that it isn't compared against time by accident.) * A field name essentially matching the containing class name is a confusing pattern (`seqno_time_mapping_`). * Wrap calls to confusing (but useful) upper_bound and lower_bound functions to have clearer names and more code reuse. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11905 Test Plan: GetOldestSequenceNum (now GetProximalSeqnoBeforeTime) and TruncateOldEntries were lacking unit tests, despite both being used in production (experimental feature). Added those and expanded others. 
Reviewed By: jowlyzhang Differential Revision: D49755592 Pulled By: pdillinger fbshipit-source-id: f72a3baac74d24b963c77e538bba89a7fc8dce51
2023-09-29 18:21:59 +00:00
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(10000000), 50U);
}
// Verifies that Sort() orders the recorded pairs and discards duplicated or
// unhelpful ones, leaving only the expected entries.
TEST_F(SeqnoTimeTest, Sort) {
  SeqnoToTimeMapping mapping;

  // A mapping holding one entry still holds exactly one after sorting.
  mapping.Add(10, 11);
  ASSERT_OK(mapping.Sort());
  ASSERT_EQ(mapping.Size(), 1);

  // {seqno, time} pairs added, in this order, on top of the entry above.
  const uint64_t kMoreEntries[][2] = {
      {10, 11},    // exact duplicate: should be removed by Sort()
      {10, 9},     // same seqno but an older time: should be removed
      {11, 9},     // not useful: should be removed by Sort()
      {9, 8},      // not useful: should be removed by Sort()
      {1, 10},     // good entry, kept
      {100, 100},  // good entry, kept
  };
  for (const auto& entry : kMoreEntries) {
    mapping.Add(entry[0], entry[1]);
  }
  ASSERT_OK(mapping.Sort());

  // Only the first entry and the two good ones survive, in sorted order.
  std::deque<SeqnoToTimeMapping::SeqnoTimePair> want;
  want.emplace_back(1, 10);
  want.emplace_back(10, 11);
  want.emplace_back(100, 100);
  ASSERT_EQ(want, mapping.TEST_GetInternalMapping());
}
TEST_F(SeqnoTimeTest, EncodeDecodeBasic) {
SeqnoToTimeMapping test(0, 1000);
std::string output;
test.Encode(output, 0, 1000, 100);
ASSERT_TRUE(output.empty());
for (int i = 1; i <= 1000; i++) {
ASSERT_TRUE(test.Append(i, i * 10));
}
test.Encode(output, 0, 1000, 100);
ASSERT_FALSE(output.empty());
SeqnoToTimeMapping decoded;
ASSERT_OK(decoded.Add(output));
ASSERT_OK(decoded.Sort());
ASSERT_EQ(decoded.Size(), SeqnoToTimeMapping::kMaxSeqnoTimePairsPerSST);
ASSERT_EQ(test.Size(), 1000);
for (SequenceNumber seq = 0; seq <= 1000; seq++) {
// test has the more accurate time mapping, encode only pick
// kMaxSeqnoTimePairsPerSST number of entries, which is less accurate
Refactor, clean up, fixes, and more testing for SeqnoToTimeMapping (#11905) Summary: This change is before a planned DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping (bug fix with existing test work-arounds). **Intended follow-up** However, I found enough issues with SeqnoToTimeMapping to warrant this PR first, including very small fixes in DB implementation related to API contract of SeqnoToTimeMapping. Functional fixes / changes: * This fixes some mishandling of boundary cases. For example, if the user decides to stop writing to DB, the last written sequence number would perpetually have its write time updated to "now" and would always be ineligible for migration to cold tier. Part of the problem is that the SeqnoToTimeMapping would return a seqno known to have been written before (immediately or otherwise) the requested time, but compaction_job.cc would include that seqno in the preserve/exclude set. That is fixed (in part) by adding one in compaction_job.cc * That problem was worse because a whole range of seqnos could be updated perpetually with new times in SeqnoToTimeMapping::Append (if no writes to DB). That logic was apparently optimized for GetOldestApproximateTime (now GetProximalTimeBeforeSeqno), which is not used in production, to the detriment of GetOldestSequenceNum (now GetProximalSeqnoBeforeTime), which is used in production. (Perhaps plans changed during development?) This is fixed in Append to optimize for accuracy of GetProximalSeqnoBeforeTime. (Unit tests added and updated.) * Related: SeqnoToTimeMapping did not have a clear contract about the relationships between seqnos and times, just the idea of a rough correspondence. Now the class description makes it clear that the write time of each recorded seqno comes before or at the associated time, to support getting best results for GetProximalSeqnoBeforeTime. And this makes it easier to make clear the contract of each API function. 
* Update `DBImpl::RecordSeqnoToTimeMapping()` to follow this ordering in gathering samples. Some part of these changes has required an expanded test work-around for the problem (see intended follow-up above) that the DB does not immediately ensure recent seqnos are covered by its mapping. These work-arounds will be removed with that planned work. An apparent compaction bug is revealed in PrecludeLastLevelTest::RangeDelsCauseFileEndpointsToOverlap, so that test is disabled. Filed GitHub issue #11909 Cosmetic / code safety things (not exhaustive): * Fix some confusing names. * `seqno_time_mapping` was used inconsistently in places. Now just `seqno_to_time_mapping` to correspond to class name. * Rename confusing `GetOldestSequenceNum` -> `GetProximalSeqnoBeforeTime` and `GetOldestApproximateTime` -> `GetProximalTimeBeforeSeqno`. Part of the motivation is that our times and seqnos here have the same underlying type, so we want to be clear about which is expected where to avoid mixing. * Rename `kUnknownSeqnoTime` to `kUnknownTimeBeforeAll` because the value is a bad choice for unknown if we ever add ProximalAfterBlah functions. * Arithmetic on SeqnoTimePair doesn't make sense except for delta encoding, so use better names / APIs with that in mind. * (OMG) Don't allow direct comparison between SeqnoTimePair and SequenceNumber. (There is no checking that it isn't compared against time by accident.) * A field name essentially matching the containing class name is a confusing pattern (`seqno_time_mapping_`). * Wrap calls to confusing (but useful) upper_bound and lower_bound functions to have clearer names and more code reuse. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11905 Test Plan: GetOldestSequenceNum (now GetProximalSeqnoBeforeTime) and TruncateOldEntries were lacking unit tests, despite both being used in production (experimental feature). Added those and expanded others. 
Reviewed By: jowlyzhang Differential Revision: D49755592 Pulled By: pdillinger fbshipit-source-id: f72a3baac74d24b963c77e538bba89a7fc8dce51
2023-09-29 18:21:59 +00:00
uint64_t target_time = test.GetProximalTimeBeforeSeqno(seq);
ASSERT_GE(decoded.GetProximalTimeBeforeSeqno(seq),
target_time < 200 ? 0 : target_time - 200);
Refactor, clean up, fixes, and more testing for SeqnoToTimeMapping (#11905) Summary: This change is before a planned DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping (bug fix with existing test work-arounds). **Intended follow-up** However, I found enough issues with SeqnoToTimeMapping to warrant this PR first, including very small fixes in DB implementation related to API contract of SeqnoToTimeMapping. Functional fixes / changes: * This fixes some mishandling of boundary cases. For example, if the user decides to stop writing to DB, the last written sequence number would perpetually have its write time updated to "now" and would always be ineligible for migration to cold tier. Part of the problem is that the SeqnoToTimeMapping would return a seqno known to have been written before (immediately or otherwise) the requested time, but compaction_job.cc would include that seqno in the preserve/exclude set. That is fixed (in part) by adding one in compaction_job.cc * That problem was worse because a whole range of seqnos could be updated perpetually with new times in SeqnoToTimeMapping::Append (if no writes to DB). That logic was apparently optimized for GetOldestApproximateTime (now GetProximalTimeBeforeSeqno), which is not used in production, to the detriment of GetOldestSequenceNum (now GetProximalSeqnoBeforeTime), which is used in production. (Perhaps plans changed during development?) This is fixed in Append to optimize for accuracy of GetProximalSeqnoBeforeTime. (Unit tests added and updated.) * Related: SeqnoToTimeMapping did not have a clear contract about the relationships between seqnos and times, just the idea of a rough correspondence. Now the class description makes it clear that the write time of each recorded seqno comes before or at the associated time, to support getting best results for GetProximalSeqnoBeforeTime. And this makes it easier to make clear the contract of each API function. 
* Update `DBImpl::RecordSeqnoToTimeMapping()` to follow this ordering in gathering samples. Some part of these changes has required an expanded test work-around for the problem (see intended follow-up above) that the DB does not immediately ensure recent seqnos are covered by its mapping. These work-arounds will be removed with that planned work. An apparent compaction bug is revealed in PrecludeLastLevelTest::RangeDelsCauseFileEndpointsToOverlap, so that test is disabled. Filed GitHub issue #11909 Cosmetic / code safety things (not exhaustive): * Fix some confusing names. * `seqno_time_mapping` was used inconsistently in places. Now just `seqno_to_time_mapping` to correspond to class name. * Rename confusing `GetOldestSequenceNum` -> `GetProximalSeqnoBeforeTime` and `GetOldestApproximateTime` -> `GetProximalTimeBeforeSeqno`. Part of the motivation is that our times and seqnos here have the same underlying type, so we want to be clear about which is expected where to avoid mixing. * Rename `kUnknownSeqnoTime` to `kUnknownTimeBeforeAll` because the value is a bad choice for unknown if we ever add ProximalAfterBlah functions. * Arithmetic on SeqnoTimePair doesn't make sense except for delta encoding, so use better names / APIs with that in mind. * (OMG) Don't allow direct comparison between SeqnoTimePair and SequenceNumber. (There is no checking that it isn't compared against time by accident.) * A field name essentially matching the containing class name is a confusing pattern (`seqno_time_mapping_`). * Wrap calls to confusing (but useful) upper_bound and lower_bound functions to have clearer names and more code reuse. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11905 Test Plan: GetOldestSequenceNum (now GetProximalSeqnoBeforeTime) and TruncateOldEntries were lacking unit tests, despite both being used in production (experimental feature). Added those and expanded others. 
Reviewed By: jowlyzhang Differential Revision: D49755592 Pulled By: pdillinger fbshipit-source-id: f72a3baac74d24b963c77e538bba89a7fc8dce51
2023-09-29 18:21:59 +00:00
ASSERT_LE(decoded.GetProximalTimeBeforeSeqno(seq), target_time);
}
}
// Checks which entries survive when Encode() is asked for fewer pairs than
// the mapping holds, before and after entries with much larger times are
// appended.
TEST_F(SeqnoTimeTest, EncodeDecodePerferNewTime) {
  using PairList = std::deque<SeqnoToTimeMapping::SeqnoTimePair>;

  SeqnoToTimeMapping source(0, 10);
  source.Append(1, 10);
  source.Append(5, 17);
  source.Append(6, 25);
  source.Append(8, 30);

  // First round: four entries stored, three requested from Encode().
  std::string encoded;
  source.Encode(encoded, 1, 10, 0, 3);

  SeqnoToTimeMapping decoded;
  ASSERT_OK(decoded.Add(encoded));
  ASSERT_OK(decoded.Sort());
  ASSERT_EQ(decoded.Size(), 3);

  PairList first_expected;
  first_expected.emplace_back(1, 10);
  first_expected.emplace_back(6, 25);
  first_expected.emplace_back(8, 30);
  ASSERT_EQ(first_expected, decoded.TEST_GetInternalMapping());

  // Second round: append a few entries with large time values, then
  // re-encode with four pairs requested.
  source.Append(10, 100);
  source.Append(13, 200);
  source.Append(16, 300);

  encoded.clear();
  source.Encode(encoded, 1, 20, 0, 4);

  decoded.Clear();
  ASSERT_OK(decoded.Add(encoded));
  ASSERT_OK(decoded.Sort());
  ASSERT_EQ(decoded.Size(), 4);

  PairList second_expected;
  second_expected.emplace_back(1, 10);
  // The entries for seqno 6 and 8 are skipped as they are too close to
  // seqno 1. The (10, 100) entry is also within the skip range, but skipping
  // it would leave too few entries to fill 4 slots, so it is selected.
  second_expected.emplace_back(10, 100);
  second_expected.emplace_back(13, 200);
  second_expected.emplace_back(16, 300);
  ASSERT_EQ(second_expected, decoded.TEST_GetInternalMapping());
}
} // namespace ROCKSDB_NAMESPACE
// Test binary entry point: installs the stack trace handler, initializes
// GoogleTest from the command line, and returns the result of running all
// registered tests.
int main(int argc, char** argv) {
  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
  ::testing::InitGoogleTest(&argc, argv);
  const int exit_code = RUN_ALL_TESTS();
  return exit_code;
}