Support options.max_open_files = -1 with periodic_compaction_seconds (#6090)

Summary:
options.periodic_compaction_seconds isn't supported when options.max_open_files != -1. It's because that the information of file creation time is stored in table properties and are not guaranteed to be loaded unless options.max_open_files = -1. Relax this constraint by storing the information in manifest.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6090

Test Plan: Pass all existing tests; Modify an existing test to force the manifest value to take 0 to simulate backward compatibility case; manually open the DB generated with the change by release 4.2.

Differential Revision: D18702268

fbshipit-source-id: 13e0bd94f546498a04f3dc5fc0d9dff5125ec9eb
This commit is contained in:
sdong 2019-11-26 21:38:38 -08:00 committed by Facebook Github Bot
parent 496a6ae895
commit aa1857e2df
20 changed files with 266 additions and 141 deletions

View File

@ -19,8 +19,9 @@
* A batched MultiGet API (DB::MultiGet()) that supports retrieving keys from multiple column families.
* Full and partitioned filters in the block-based table use an improved Bloom filter implementation, enabled with format_version 5 (or above) because previous releases cannot read this filter. This replacement is faster and more accurate, especially for high bits per key or millions of keys in a single (full) filter. For example, the new Bloom filter has the same false postive rate at 9.55 bits per key as the old one at 10 bits per key, and a lower false positive rate at 16 bits per key than the old one at 100 bits per key.
* Added AVX2 instructions to USE_SSE builds to accelerate the new Bloom filter and XXH3-based hash function on compatible x86_64 platforms (Haswell and later, ~2014).
* Support options.ttl with options.max_open_files = -1. File's oldest ancester time will be written to manifest. If it is availalbe, this information will be used instead of creation_time in table properties.
* Support options.ttl or options.periodic_compaction_seconds with options.max_open_files = -1. File's oldest ancester time and file creation time will be written to manifest. If it is availalbe, this information will be used instead of creation_time and file_creation_time in table properties.
* Setting options.ttl for universal compaction now has the same meaning as setting periodic_compaction_seconds.
* SstFileMetaData also returns file creation time and oldest ancester time.
* The `sst_dump` command line tool `recompress` command now displays how many blocks were compressed and how many were not, in particular how many were not compressed because the compression ratio was not met (12.5% threshold for GoodCompressionRatio), as seen in the `number.block.not_compressed` counter stat since version 6.0.0.
* The block cache usage is now takes into account the overhead of metadata per each entry. This results into more accurate managment of memory. A side-effect of this feature is that less items are fit into the block cache of the same size, which would result to higher cache miss rates. This can be remedied by increasing the block cache size or passing kDontChargeCacheMetadata to its constuctor to restore the old behavior.
* When using BlobDB, a mapping is maintained and persisted in the MANIFEST between each SST file and the oldest non-TTL blob file it references.

View File

@ -1501,6 +1501,7 @@ Status CompactionJob::OpenCompactionOutputFile(
out.meta.fd = FileDescriptor(file_number,
sub_compact->compaction->output_path_id(), 0);
out.meta.oldest_ancester_time = oldest_ancester_time;
out.meta.file_creation_time = current_time;
out.finished = false;
sub_compact->outputs.push_back(out);
}

View File

@ -184,7 +184,7 @@ class CompactionJobTest : public testing::Test {
VersionEdit edit;
edit.AddFile(level, file_number, 0, 10, smallest_key, largest_key,
smallest_seqno, largest_seqno, false, oldest_blob_file_number,
kUnknownOldestAncesterTime);
kUnknownOldestAncesterTime, kUnknownFileCreationTime);
mutex_.Lock();
versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),

View File

@ -95,7 +95,7 @@ class CompactionPickerTest : public testing::Test {
InternalKey(smallest, smallest_seq, kTypeValue),
InternalKey(largest, largest_seq, kTypeValue), smallest_seq,
largest_seq, /* marked_for_compact */ false, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime);
kUnknownOldestAncesterTime, kUnknownFileCreationTime);
f->compensated_file_size =
(compensated_file_size != 0) ? compensated_file_size : file_size;
vstorage_->AddFile(level, f);

View File

@ -3657,71 +3657,103 @@ TEST_F(DBCompactionTest, LevelPeriodicCompaction) {
const int kNumLevelFiles = 2;
const int kValueSize = 100;
Options options = CurrentOptions();
options.periodic_compaction_seconds = 48 * 60 * 60; // 2 days
options.max_open_files = -1; // needed for ttl compaction
env_->time_elapse_only_sleep_ = false;
options.env = env_;
for (bool if_restart : {false, true}) {
for (bool if_open_all_files : {false, true}) {
Options options = CurrentOptions();
options.periodic_compaction_seconds = 48 * 60 * 60; // 2 days
if (if_open_all_files) {
options.max_open_files = -1; // needed for ttl compaction
} else {
options.max_open_files = 20;
}
// RocksDB sanitize max open files to at least 20. Modify it back.
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
int* max_open_files = static_cast<int*>(arg);
*max_open_files = 0;
});
// In the case where all files are opened and doing DB restart
// forcing the file creation time in manifest file to be 0 to
// simulate the case of reading from an old version.
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"VersionEdit::EncodeTo:VarintFileCreationTime", [&](void* arg) {
if (if_restart && if_open_all_files) {
std::string* encoded_fieled = static_cast<std::string*>(arg);
*encoded_fieled = "";
PutVarint64(encoded_fieled, 0);
}
});
env_->addon_time_.store(0);
DestroyAndReopen(options);
env_->time_elapse_only_sleep_ = false;
options.env = env_;
int periodic_compactions = 0;
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
Compaction* compaction = reinterpret_cast<Compaction*>(arg);
auto compaction_reason = compaction->compaction_reason();
if (compaction_reason == CompactionReason::kPeriodicCompaction) {
periodic_compactions++;
env_->addon_time_.store(0);
DestroyAndReopen(options);
int periodic_compactions = 0;
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
Compaction* compaction = reinterpret_cast<Compaction*>(arg);
auto compaction_reason = compaction->compaction_reason();
if (compaction_reason == CompactionReason::kPeriodicCompaction) {
periodic_compactions++;
}
});
rocksdb::SyncPoint::GetInstance()->EnableProcessing();
Random rnd(301);
for (int i = 0; i < kNumLevelFiles; ++i) {
for (int j = 0; j < kNumKeysPerFile; ++j) {
ASSERT_OK(Put(Key(i * kNumKeysPerFile + j),
RandomString(&rnd, kValueSize)));
}
});
rocksdb::SyncPoint::GetInstance()->EnableProcessing();
Flush();
}
dbfull()->TEST_WaitForCompact();
Random rnd(301);
for (int i = 0; i < kNumLevelFiles; ++i) {
for (int j = 0; j < kNumKeysPerFile; ++j) {
ASSERT_OK(
Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize)));
ASSERT_EQ("2", FilesPerLevel());
ASSERT_EQ(0, periodic_compactions);
// Add 50 hours and do a write
env_->addon_time_.fetch_add(50 * 60 * 60);
ASSERT_OK(Put("a", "1"));
Flush();
dbfull()->TEST_WaitForCompact();
// Assert that the files stay in the same level
ASSERT_EQ("3", FilesPerLevel());
// The two old files go through the periodic compaction process
ASSERT_EQ(2, periodic_compactions);
MoveFilesToLevel(1);
ASSERT_EQ("0,3", FilesPerLevel());
// Add another 50 hours and do another write
env_->addon_time_.fetch_add(50 * 60 * 60);
ASSERT_OK(Put("b", "2"));
if (if_restart) {
Reopen(options);
} else {
Flush();
}
dbfull()->TEST_WaitForCompact();
ASSERT_EQ("1,3", FilesPerLevel());
// The three old files now go through the periodic compaction process. 2
// + 3.
ASSERT_EQ(5, periodic_compactions);
// Add another 50 hours and do another write
env_->addon_time_.fetch_add(50 * 60 * 60);
ASSERT_OK(Put("c", "3"));
Flush();
dbfull()->TEST_WaitForCompact();
ASSERT_EQ("2,3", FilesPerLevel());
// The four old files now go through the periodic compaction process. 5
// + 4.
ASSERT_EQ(9, periodic_compactions);
rocksdb::SyncPoint::GetInstance()->DisableProcessing();
}
Flush();
}
dbfull()->TEST_WaitForCompact();
ASSERT_EQ("2", FilesPerLevel());
ASSERT_EQ(0, periodic_compactions);
// Add 50 hours and do a write
env_->addon_time_.fetch_add(50 * 60 * 60);
ASSERT_OK(Put("a", "1"));
Flush();
dbfull()->TEST_WaitForCompact();
// Assert that the files stay in the same level
ASSERT_EQ("3", FilesPerLevel());
// The two old files go through the periodic compaction process
ASSERT_EQ(2, periodic_compactions);
MoveFilesToLevel(1);
ASSERT_EQ("0,3", FilesPerLevel());
// Add another 50 hours and do another write
env_->addon_time_.fetch_add(50 * 60 * 60);
ASSERT_OK(Put("b", "2"));
Flush();
dbfull()->TEST_WaitForCompact();
ASSERT_EQ("1,3", FilesPerLevel());
// The three old files now go through the periodic compaction process. 2 + 3.
ASSERT_EQ(5, periodic_compactions);
// Add another 50 hours and do another write
env_->addon_time_.fetch_add(50 * 60 * 60);
ASSERT_OK(Put("c", "3"));
Flush();
dbfull()->TEST_WaitForCompact();
ASSERT_EQ("2,3", FilesPerLevel());
// The four old files now go through the periodic compaction process. 5 + 4.
ASSERT_EQ(9, periodic_compactions);
rocksdb::SyncPoint::GetInstance()->DisableProcessing();
}
TEST_F(DBCompactionTest, LevelPeriodicCompactionWithOldDB) {
@ -3734,7 +3766,6 @@ TEST_F(DBCompactionTest, LevelPeriodicCompactionWithOldDB) {
const int kValueSize = 100;
Options options = CurrentOptions();
options.max_open_files = -1; // needed for ttl compaction
env_->time_elapse_only_sleep_ = false;
options.env = env_;

View File

@ -1257,7 +1257,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
f->fd.GetFileSize(), f->smallest, f->largest,
f->fd.smallest_seqno, f->fd.largest_seqno,
f->marked_for_compaction, f->oldest_blob_file_number,
f->oldest_ancester_time);
f->oldest_ancester_time, f->file_creation_time);
}
ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
"[%s] Apply version edit:\n%s", cfd->GetName().c_str(),
@ -2672,7 +2672,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest,
f->largest, f->fd.smallest_seqno,
f->fd.largest_seqno, f->marked_for_compaction,
f->oldest_blob_file_number, f->oldest_ancester_time);
f->oldest_blob_file_number, f->oldest_ancester_time,
f->file_creation_time);
ROCKS_LOG_BUFFER(
log_buffer,

View File

@ -129,7 +129,7 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) {
f->fd.GetFileSize(), f->smallest, f->largest,
f->fd.smallest_seqno, f->fd.largest_seqno,
f->marked_for_compaction, f->oldest_blob_file_number,
f->oldest_ancester_time);
f->oldest_ancester_time, f->file_creation_time);
}
status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),

View File

@ -1226,7 +1226,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
meta.fd.GetFileSize(), meta.smallest, meta.largest,
meta.fd.smallest_seqno, meta.fd.largest_seqno,
meta.marked_for_compaction, meta.oldest_blob_file_number,
meta.oldest_ancester_time);
meta.oldest_ancester_time, meta.file_creation_time);
}
InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);

View File

@ -1022,7 +1022,8 @@ TEST_F(DBTest, FailMoreDbPaths) {
void CheckColumnFamilyMeta(
const ColumnFamilyMetaData& cf_meta,
const std::vector<std::vector<FileMetaData>>& files_by_level) {
const std::vector<std::vector<FileMetaData>>& files_by_level,
uint64_t start_time, uint64_t end_time) {
ASSERT_EQ(cf_meta.name, kDefaultColumnFamilyName);
ASSERT_EQ(cf_meta.levels.size(), files_by_level.size());
@ -1060,6 +1061,14 @@ void CheckColumnFamilyMeta(
file_meta_from_files.largest.user_key().ToString());
ASSERT_EQ(file_meta_from_cf.oldest_blob_file_number,
file_meta_from_files.oldest_blob_file_number);
ASSERT_EQ(file_meta_from_cf.oldest_ancester_time,
file_meta_from_files.oldest_ancester_time);
ASSERT_EQ(file_meta_from_cf.file_creation_time,
file_meta_from_files.file_creation_time);
ASSERT_GE(file_meta_from_cf.file_creation_time, start_time);
ASSERT_LE(file_meta_from_cf.file_creation_time, end_time);
ASSERT_GE(file_meta_from_cf.oldest_ancester_time, start_time);
ASSERT_LE(file_meta_from_cf.oldest_ancester_time, end_time);
}
ASSERT_EQ(level_meta_from_cf.size, level_size);
@ -1113,6 +1122,11 @@ TEST_F(DBTest, MetaDataTest) {
Options options = CurrentOptions();
options.create_if_missing = true;
options.disable_auto_compactions = true;
int64_t temp_time = 0;
options.env->GetCurrentTime(&temp_time);
uint64_t start_time = static_cast<uint64_t>(temp_time);
DestroyAndReopen(options);
Random rnd(301);
@ -1139,9 +1153,12 @@ TEST_F(DBTest, MetaDataTest) {
std::vector<std::vector<FileMetaData>> files_by_level;
dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files_by_level);
options.env->GetCurrentTime(&temp_time);
uint64_t end_time = static_cast<uint64_t>(temp_time);
ColumnFamilyMetaData cf_meta;
db_->GetColumnFamilyMetaData(&cf_meta);
CheckColumnFamilyMeta(cf_meta, files_by_level);
CheckColumnFamilyMeta(cf_meta, files_by_level, start_time, end_time);
std::vector<LiveFileMetaData> live_file_meta;
db_->GetLiveFilesMetaData(&live_file_meta);
@ -6420,6 +6437,12 @@ TEST_F(DBTest, CreationTimeOfOldestFile) {
}
}
});
// Set file creation time in manifest all to 0.
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"FileMetaData::FileMetaData", [&](void* arg) {
FileMetaData* meta = static_cast<FileMetaData*>(arg);
meta->file_creation_time = 0;
});
rocksdb::SyncPoint::GetInstance()->EnableProcessing();
Random rnd(301);
@ -6431,7 +6454,7 @@ TEST_F(DBTest, CreationTimeOfOldestFile) {
Flush();
}
// At this point there should be 2 files, oen with file_creation_time = 0 and
// At this point there should be 2 files, one with file_creation_time = 0 and
// the other non-zero. GetCreationTimeOfOldestFile API should return 0.
uint64_t creation_time;
Status s1 = dbfull()->GetCreationTimeOfOldestFile(&creation_time);

View File

@ -246,16 +246,19 @@ Status ExternalSstFileIngestionJob::Run() {
// We use the import time as the ancester time. This is the time the data
// is written to the database.
uint64_t oldest_ancester_time = 0;
int64_t temp_current_time = 0;
uint64_t current_time = kUnknownFileCreationTime;
uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
if (env_->GetCurrentTime(&temp_current_time).ok()) {
oldest_ancester_time = static_cast<uint64_t>(temp_current_time);
current_time = oldest_ancester_time =
static_cast<uint64_t>(temp_current_time);
}
edit_.AddFile(f.picked_level, f.fd.GetNumber(), f.fd.GetPathId(),
f.fd.GetFileSize(), f.smallest_internal_key,
f.largest_internal_key, f.assigned_seqno, f.assigned_seqno,
false, kInvalidBlobFileNumber, oldest_ancester_time);
false, kInvalidBlobFileNumber, oldest_ancester_time,
current_time);
}
return status;
}

View File

@ -368,6 +368,7 @@ Status FlushJob::WriteLevel0Table() {
// It's not clear whether oldest_key_time is always available. In case
// it is not available, use current_time.
meta_.oldest_ancester_time = std::min(current_time, oldest_key_time);
meta_.file_creation_time = current_time;
s = BuildTable(
dbname_, db_options_.env, *cfd_->ioptions(), mutable_cf_options_,
@ -413,7 +414,7 @@ Status FlushJob::WriteLevel0Table() {
meta_.fd.GetFileSize(), meta_.smallest, meta_.largest,
meta_.fd.smallest_seqno, meta_.fd.largest_seqno,
meta_.marked_for_compaction, meta_.oldest_blob_file_number,
meta_.oldest_ancester_time);
meta_.oldest_ancester_time, meta_.file_creation_time);
}
#ifndef ROCKSDB_LITE
// Piggyback FlushJobInfo on the first first flushed memtable.

View File

@ -135,10 +135,12 @@ Status ImportColumnFamilyJob::Run() {
// We use the import time as the ancester time. This is the time the data
// is written to the database.
uint64_t oldest_ancester_time = 0;
int64_t temp_current_time = 0;
uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
uint64_t current_time = kUnknownOldestAncesterTime;
if (env_->GetCurrentTime(&temp_current_time).ok()) {
oldest_ancester_time = static_cast<uint64_t>(temp_current_time);
current_time = oldest_ancester_time =
static_cast<uint64_t>(temp_current_time);
}
for (size_t i = 0; i < files_to_import_.size(); ++i) {
@ -149,7 +151,7 @@ Status ImportColumnFamilyJob::Run() {
f.fd.GetFileSize(), f.smallest_internal_key,
f.largest_internal_key, file_metadata.smallest_seqno,
file_metadata.largest_seqno, false, kInvalidBlobFileNumber,
oldest_ancester_time);
oldest_ancester_time, current_time);
// If incoming sequence number is higher, update local sequence number.
if (file_metadata.largest_seqno > versions_->LastSequence()) {

View File

@ -577,13 +577,13 @@ class Repairer {
// TODO(opt): separate out into multiple levels
for (const auto* table : cf_id_and_tables.second) {
edit.AddFile(0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(),
table->meta.fd.GetFileSize(), table->meta.smallest,
table->meta.largest, table->meta.fd.smallest_seqno,
table->meta.fd.largest_seqno,
table->meta.marked_for_compaction,
table->meta.oldest_blob_file_number,
table->meta.oldest_ancester_time);
edit.AddFile(
0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(),
table->meta.fd.GetFileSize(), table->meta.smallest,
table->meta.largest, table->meta.fd.smallest_seqno,
table->meta.fd.largest_seqno, table->meta.marked_for_compaction,
table->meta.oldest_blob_file_number,
table->meta.oldest_ancester_time, table->meta.file_creation_time);
}
assert(next_file_number_ > 0);
vset_.MarkFileNumberUsed(next_file_number_ - 1);

View File

@ -63,7 +63,7 @@ class VersionBuilderTest : public testing::Test {
file_number, path_id, file_size, GetInternalKey(smallest, smallest_seq),
GetInternalKey(largest, largest_seq), smallest_seqno, largest_seqno,
/* marked_for_compact */ false, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime);
kUnknownOldestAncesterTime, kUnknownFileCreationTime);
f->compensated_file_size = file_size;
f->num_entries = num_entries;
f->num_deletions = num_deletions;
@ -114,7 +114,8 @@ TEST_F(VersionBuilderTest, ApplyAndSaveTo) {
VersionEdit version_edit;
version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
GetInternalKey("350"), 200, 200, false,
kInvalidBlobFileNumber, kUnknownOldestAncesterTime);
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
kUnknownFileCreationTime);
version_edit.DeleteFile(3, 27U);
EnvOptions env_options;
@ -149,7 +150,8 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) {
VersionEdit version_edit;
version_edit.AddFile(3, 666, 0, 100U, GetInternalKey("301"),
GetInternalKey("350"), 200, 200, false,
kInvalidBlobFileNumber, kUnknownOldestAncesterTime);
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
kUnknownFileCreationTime);
version_edit.DeleteFile(0, 1U);
version_edit.DeleteFile(0, 88U);
@ -187,7 +189,8 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) {
VersionEdit version_edit;
version_edit.AddFile(4, 666, 0, 100U, GetInternalKey("301"),
GetInternalKey("350"), 200, 200, false,
kInvalidBlobFileNumber, kUnknownOldestAncesterTime);
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
kUnknownFileCreationTime);
version_edit.DeleteFile(0, 1U);
version_edit.DeleteFile(0, 88U);
version_edit.DeleteFile(4, 6U);
@ -216,19 +219,24 @@ TEST_F(VersionBuilderTest, ApplyMultipleAndSaveTo) {
VersionEdit version_edit;
version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
GetInternalKey("350"), 200, 200, false,
kInvalidBlobFileNumber, kUnknownOldestAncesterTime);
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
kUnknownFileCreationTime);
version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"),
GetInternalKey("450"), 200, 200, false,
kInvalidBlobFileNumber, kUnknownOldestAncesterTime);
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
kUnknownFileCreationTime);
version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"),
GetInternalKey("650"), 200, 200, false,
kInvalidBlobFileNumber, kUnknownOldestAncesterTime);
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
kUnknownFileCreationTime);
version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"),
GetInternalKey("550"), 200, 200, false,
kInvalidBlobFileNumber, kUnknownOldestAncesterTime);
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
kUnknownFileCreationTime);
version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"),
GetInternalKey("750"), 200, 200, false,
kInvalidBlobFileNumber, kUnknownOldestAncesterTime);
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
kUnknownFileCreationTime);
EnvOptions env_options;
@ -255,30 +263,37 @@ TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) {
VersionEdit version_edit;
version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
GetInternalKey("350"), 200, 200, false,
kInvalidBlobFileNumber, kUnknownOldestAncesterTime);
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
kUnknownFileCreationTime);
version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"),
GetInternalKey("450"), 200, 200, false,
kInvalidBlobFileNumber, kUnknownOldestAncesterTime);
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
kUnknownFileCreationTime);
version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"),
GetInternalKey("650"), 200, 200, false,
kInvalidBlobFileNumber, kUnknownOldestAncesterTime);
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
kUnknownFileCreationTime);
version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"),
GetInternalKey("550"), 200, 200, false,
kInvalidBlobFileNumber, kUnknownOldestAncesterTime);
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
kUnknownFileCreationTime);
version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"),
GetInternalKey("750"), 200, 200, false,
kInvalidBlobFileNumber, kUnknownOldestAncesterTime);
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
kUnknownFileCreationTime);
version_builder.Apply(&version_edit);
VersionEdit version_edit2;
version_edit.AddFile(2, 808, 0, 100U, GetInternalKey("901"),
GetInternalKey("950"), 200, 200, false,
kInvalidBlobFileNumber, kUnknownOldestAncesterTime);
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
kUnknownFileCreationTime);
version_edit2.DeleteFile(2, 616);
version_edit2.DeleteFile(2, 636);
version_edit.AddFile(2, 806, 0, 100U, GetInternalKey("801"),
GetInternalKey("850"), 200, 200, false,
kInvalidBlobFileNumber, kUnknownOldestAncesterTime);
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
kUnknownFileCreationTime);
version_builder.Apply(&version_edit2);
version_builder.SaveTo(&new_vstorage);

View File

@ -62,6 +62,7 @@ enum CustomTag : uint32_t {
kMinLogNumberToKeepHack = 3,
kOldestBlobFileNumber = 4,
kOldestAncesterTime = 5,
kFileCreationTime = 6,
kPathId = 65,
};
// If this bit for the custom tag is set, opening DB should fail if
@ -217,6 +218,14 @@ bool VersionEdit::EncodeTo(std::string* dst) const {
TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintOldestAncesterTime",
&varint_oldest_ancester_time);
PutLengthPrefixedSlice(dst, Slice(varint_oldest_ancester_time));
PutVarint32(dst, CustomTag::kFileCreationTime);
std::string varint_file_creation_time;
PutVarint64(&varint_file_creation_time, f.file_creation_time);
TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintFileCreationTime",
&varint_file_creation_time);
PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time));
if (f.fd.GetPathId() != 0) {
PutVarint32(dst, CustomTag::kPathId);
char p = static_cast<char>(f.fd.GetPathId());
@ -335,6 +344,11 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) {
return "invalid oldest ancester time";
}
break;
case kFileCreationTime:
if (!GetVarint64(&field, &f.file_creation_time)) {
return "invalid file creation time";
}
break;
case kNeedCompaction:
if (field.size() != 1) {
return "need_compaction field wrong size";
@ -660,6 +674,8 @@ std::string VersionEdit::DebugString(bool hex_key) const {
}
r.append(" oldest_ancester_time:");
AppendNumberTo(&r, f.oldest_ancester_time);
r.append(" file_creation_time:");
AppendNumberTo(&r, f.file_creation_time);
}
r.append("\n ColumnFamily: ");
AppendNumberTo(&r, column_family_);

View File

@ -26,6 +26,7 @@ class VersionSet;
constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF;
constexpr uint64_t kInvalidBlobFileNumber = 0;
constexpr uint64_t kUnknownOldestAncesterTime = 0;
constexpr uint64_t kUnknownFileCreationTime = 0;
extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id);
@ -128,7 +129,10 @@ struct FileMetaData {
// in turn be outputs for compact older SST files. We track the memtable
// flush timestamp for the oldest SST file that eventaully contribute data
// to this file. 0 means the information is not available.
uint64_t oldest_ancester_time = 0;
uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
// Unix time when the SST file is created.
uint64_t file_creation_time = kUnknownFileCreationTime;
FileMetaData() = default;
@ -136,13 +140,17 @@ struct FileMetaData {
const InternalKey& smallest_key, const InternalKey& largest_key,
const SequenceNumber& smallest_seq,
const SequenceNumber& largest_seq, bool marked_for_compact,
uint64_t oldest_blob_file, uint64_t _oldest_ancester_time)
uint64_t oldest_blob_file, uint64_t _oldest_ancester_time,
uint64_t _file_creation_time)
: fd(file, file_path_id, file_size, smallest_seq, largest_seq),
smallest(smallest_key),
largest(largest_key),
marked_for_compaction(marked_for_compact),
oldest_blob_file_number(oldest_blob_file),
oldest_ancester_time(_oldest_ancester_time) {}
oldest_ancester_time(_oldest_ancester_time),
file_creation_time(_file_creation_time) {
TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this);
}
// REQUIRED: Keys must be given to the function in sorted order (it expects
// the last key to be the largest).
@ -168,13 +176,23 @@ struct FileMetaData {
// if table reader is already pinned.
// 0 means the information is not available.
uint64_t TryGetOldestAncesterTime() {
if (oldest_ancester_time != 0) {
if (oldest_ancester_time != kUnknownOldestAncesterTime) {
return oldest_ancester_time;
} else if (fd.table_reader != nullptr &&
fd.table_reader->GetTableProperties() != nullptr) {
return fd.table_reader->GetTableProperties()->creation_time;
}
return 0;
return kUnknownOldestAncesterTime;
}
uint64_t TryGetFileCreationTime() {
if (file_creation_time != kUnknownFileCreationTime) {
return file_creation_time;
} else if (fd.table_reader != nullptr &&
fd.table_reader->GetTableProperties() != nullptr) {
return fd.table_reader->GetTableProperties()->file_creation_time;
}
return kUnknownFileCreationTime;
}
};
@ -277,14 +295,14 @@ class VersionEdit {
uint64_t file_size, const InternalKey& smallest,
const InternalKey& largest, const SequenceNumber& smallest_seqno,
const SequenceNumber& largest_seqno, bool marked_for_compaction,
uint64_t oldest_blob_file_number,
uint64_t oldest_ancester_time) {
uint64_t oldest_blob_file_number, uint64_t oldest_ancester_time,
uint64_t file_creation_time) {
assert(smallest_seqno <= largest_seqno);
new_files_.emplace_back(
level,
FileMetaData(file, file_path_id, file_size, smallest, largest,
smallest_seqno, largest_seqno, marked_for_compaction,
oldest_blob_file_number, oldest_ancester_time));
level, FileMetaData(file, file_path_id, file_size, smallest, largest,
smallest_seqno, largest_seqno,
marked_for_compaction, oldest_blob_file_number,
oldest_ancester_time, file_creation_time));
}
void AddFile(int level, const FileMetaData& f) {

View File

@ -37,7 +37,7 @@ TEST_F(VersionEditTest, EncodeDecode) {
InternalKey("foo", kBig + 500 + i, kTypeValue),
InternalKey("zoo", kBig + 600 + i, kTypeDeletion),
kBig + 500 + i, kBig + 600 + i, false, kInvalidBlobFileNumber,
888);
888, 678);
edit.DeleteFile(4, kBig + 700 + i);
}
@ -55,17 +55,19 @@ TEST_F(VersionEditTest, EncodeDecodeNewFile4) {
edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
kBig + 600, true, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime);
kUnknownOldestAncesterTime, kUnknownFileCreationTime);
edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue),
InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501,
kBig + 601, false, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime);
kUnknownOldestAncesterTime, kUnknownFileCreationTime);
edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue),
InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502,
kBig + 602, true, kInvalidBlobFileNumber, 666);
kBig + 602, true, kInvalidBlobFileNumber, 666, 888);
edit.AddFile(5, 303, 0, 100, InternalKey("foo", kBig + 503, kTypeBlobIndex),
InternalKey("zoo", kBig + 603, kTypeBlobIndex), kBig + 503,
kBig + 603, true, 1001, kUnknownOldestAncesterTime);
kBig + 603, true, 1001, kUnknownOldestAncesterTime,
kUnknownFileCreationTime);
;
edit.DeleteFile(4, 700);
@ -104,10 +106,10 @@ TEST_F(VersionEditTest, ForwardCompatibleNewFile4) {
edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
kBig + 600, true, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime);
kUnknownOldestAncesterTime, kUnknownFileCreationTime);
edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue),
InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501,
kBig + 601, false, kInvalidBlobFileNumber, 686);
kBig + 601, false, kInvalidBlobFileNumber, 686, 868);
edit.DeleteFile(4, 700);
edit.SetComparatorName("foo");
@ -154,7 +156,7 @@ TEST_F(VersionEditTest, NewFile4NotSupportedField) {
edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
kBig + 600, true, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime);
kUnknownOldestAncesterTime, kUnknownFileCreationTime);
edit.SetComparatorName("foo");
edit.SetLogNumber(kBig + 100);
@ -182,7 +184,8 @@ TEST_F(VersionEditTest, NewFile4NotSupportedField) {
TEST_F(VersionEditTest, EncodeEmptyFile) {
VersionEdit edit;
edit.AddFile(0, 0, 0, 0, InternalKey(), InternalKey(), 0, 0, false,
kInvalidBlobFileNumber, kUnknownOldestAncesterTime);
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
kUnknownFileCreationTime);
std::string buffer;
ASSERT_TRUE(!edit.EncodeTo(&buffer));
}

View File

@ -1459,7 +1459,8 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
file->fd.largest_seqno, file->smallest.user_key().ToString(),
file->largest.user_key().ToString(),
file->stats.num_reads_sampled.load(std::memory_order_relaxed),
file->being_compacted, file->oldest_blob_file_number});
file->being_compacted, file->oldest_blob_file_number,
file->TryGetOldestAncesterTime(), file->TryGetFileCreationTime()});
files.back().num_entries = file->num_entries;
files.back().num_deletions = file->num_deletions;
level_size += file->fd.GetFileSize();
@ -1485,10 +1486,9 @@ void Version::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
for (int level = 0; level < storage_info_.num_non_empty_levels_; level++) {
for (FileMetaData* meta : storage_info_.LevelFiles(level)) {
assert(meta->fd.table_reader != nullptr);
uint64_t file_creation_time =
meta->fd.table_reader->GetTableProperties()->file_creation_time;
if (file_creation_time == 0) {
*creation_time = file_creation_time;
uint64_t file_creation_time = meta->TryGetFileCreationTime();
if (file_creation_time == kUnknownFileCreationTime) {
*creation_time = 0;
return;
}
if (file_creation_time < oldest_time) {
@ -2501,8 +2501,7 @@ void VersionStorageInfo::ComputeExpiredTtlFiles(
void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction(
const ImmutableCFOptions& ioptions,
const uint64_t periodic_compaction_seconds) {
assert(periodic_compaction_seconds > 0 &&
periodic_compaction_seconds < port::kMaxUint64);
assert(periodic_compaction_seconds > 0);
files_marked_for_periodic_compaction_.clear();
@ -2513,8 +2512,8 @@ void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction(
}
const uint64_t current_time = static_cast<uint64_t>(temp_current_time);
// If periodic_compaction_seconds > current_time, no file possibly qualifies
// periodic compaction.
// If periodic_compaction_seconds is larger than current time, periodic
// compaction can't possibly be triggered.
if (periodic_compaction_seconds > current_time) {
return;
}
@ -2524,20 +2523,18 @@ void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction(
for (int level = 0; level < num_levels(); level++) {
for (auto f : files_[level]) {
if (!f->being_compacted && f->fd.table_reader != nullptr &&
f->fd.table_reader->GetTableProperties() != nullptr) {
if (!f->being_compacted) {
// Compute a file's modification time in the following order:
// 1. Use file_creation_time table property if it is > 0.
// 2. Use creation_time table property if it is > 0.
// 3. Use file's mtime metadata if the above two table properties are 0.
// Don't consider the file at all if the modification time cannot be
// correctly determined based on the above conditions.
uint64_t file_modification_time =
f->fd.table_reader->GetTableProperties()->file_creation_time;
if (file_modification_time == 0) {
uint64_t file_modification_time = f->TryGetFileCreationTime();
if (file_modification_time == kUnknownFileCreationTime) {
file_modification_time = f->TryGetOldestAncesterTime();
}
if (file_modification_time == 0) {
if (file_modification_time == kUnknownOldestAncesterTime) {
auto file_path = TableFileName(ioptions.cf_paths, f->fd.GetNumber(),
f->fd.GetPathId());
status = ioptions.env->GetFileModificationTime(
@ -4980,7 +4977,7 @@ Status VersionSet::WriteCurrentStateToManifest(log::Writer* log) {
f->fd.GetFileSize(), f->smallest, f->largest,
f->fd.smallest_seqno, f->fd.largest_seqno,
f->marked_for_compaction, f->oldest_blob_file_number,
f->oldest_ancester_time);
f->oldest_ancester_time, f->file_creation_time);
}
}
edit.SetLogNumber(cfd->GetLogNumber());

View File

@ -40,7 +40,7 @@ class GenerateLevelFilesBriefTest : public testing::Test {
InternalKey(smallest, smallest_seq, kTypeValue),
InternalKey(largest, largest_seq, kTypeValue), smallest_seq,
largest_seq, /* marked_for_compact */ false, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime);
kUnknownOldestAncesterTime, kUnknownFileCreationTime);
files_.push_back(f);
}
@ -135,7 +135,7 @@ class VersionStorageInfoTest : public testing::Test {
file_number, 0, file_size, GetInternalKey(smallest, 0),
GetInternalKey(largest, 0), /* smallest_seq */ 0, /* largest_seq */ 0,
/* marked_for_compact */ false, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime);
kUnknownOldestAncesterTime, kUnknownFileCreationTime);
f->compensated_file_size = file_size;
vstorage_.AddFile(level, f);
}
@ -146,7 +146,8 @@ class VersionStorageInfoTest : public testing::Test {
FileMetaData* f = new FileMetaData(
file_number, 0, file_size, smallest, largest, /* smallest_seq */ 0,
/* largest_seq */ 0, /* marked_for_compact */ false,
kInvalidBlobFileNumber, kUnknownOldestAncesterTime);
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
kUnknownFileCreationTime);
f->compensated_file_size = file_size;
vstorage_.AddFile(level, f);
}

View File

@ -69,7 +69,8 @@ struct SstFileMetaData {
SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno,
const std::string& _smallestkey,
const std::string& _largestkey, uint64_t _num_reads_sampled,
bool _being_compacted, uint64_t _oldest_blob_file_number)
bool _being_compacted, uint64_t _oldest_blob_file_number,
uint64_t _oldest_ancester_time, uint64_t _file_creation_time)
: size(_size),
name(_file_name),
file_number(_file_number),
@ -82,7 +83,9 @@ struct SstFileMetaData {
being_compacted(_being_compacted),
num_entries(0),
num_deletions(0),
oldest_blob_file_number(_oldest_blob_file_number) {}
oldest_blob_file_number(_oldest_blob_file_number),
oldest_ancester_time(_oldest_ancester_time),
file_creation_time(_file_creation_time) {}
// File size in bytes.
size_t size;
@ -105,6 +108,15 @@ struct SstFileMetaData {
uint64_t oldest_blob_file_number; // The id of the oldest blob file
// referenced by the file.
// An SST file may be generated by compactions whose input files may
// in turn be generated by earlier compactions. The creation time of the
// oldest SST file that is the compaction ancester of this file.
// The timestamp is provided Env::GetCurrentTime().
// 0 if the information is not available.
uint64_t oldest_ancester_time;
// Timestamp when the SST file is created, provided by Env::GetCurrentTime().
// 0 if the information is not available.
uint64_t file_creation_time;
};
// The full set of metadata associated with each SST file.