rocksdb/db/import_column_family_job.cc
Peter Dillinger a53ed91691 Fix/improve temperature handling for file ingestion (#12402)
Summary:
Partly following up on leftovers from https://github.com/facebook/rocksdb/issues/12388

In terms of public API:
* Make it clear that IngestExternalFileArg::file_temperature is just a hint for opening the existing file, though it was previously used for both copy-from temp hint and copy-to temp, which was bizarre.
* Specify how IngestExternalFile assigns temperature to file ingested into DB. (See details in comments.) This approach is not perfect in terms of matching how the DB assigns temperatures, but was the simplest way to get close. The key complication for matching DB temperature assignments is that ingestion files are copied (to a destination temp) before their target level is determined (in general).
* Add a temperature option to SstFileWriter::Open so that files intended for ingestion can be initially written to a chosen temperature.
* Note that "fail_if_not_bottommost_level" is obsolete/confusing use of "bottommost"

In terms of the implementation, there was a similar bit of oddness with the internal CopyFile API, which only took one temperature, ambiguously applicable to the source, destination, or both. This is also fixed.

Eventual suggested follow-up:
* Before copying files for ingestion, determine a tentative level assignment to use for destination temperature, and keep that even if final level assignment happens to be different at commit time (rare).
* More temperature handling for CreateColumnFamilyWithImport and Checkpoints.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12402

Test Plan:
Deeply revamped
ExternalSSTFileBasicTest.IngestWithTemperature to test the new changes. Previously this test was insufficient because it was only looking at temperatures according to the DB manifest. Incorporating FileTemperatureTestFS allows us to also test the temperatures in the storage layer.

Used macros instead of functions for better tracing to critical source location on test failures.

Some enhancements to FileTemperatureTestFS in the process of developing the revamped test.

Reviewed By: jowlyzhang

Differential Revision: D54442794

Pulled By: pdillinger

fbshipit-source-id: 41d9d0afdc073e6a983304c10bbc07c70cc7e995
2024-03-05 16:56:08 -08:00

461 lines
17 KiB
C++

// Copyright (c) Meta Platforms, Inc. and affiliates.
//
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "db/version_builder.h"
#include "db/import_column_family_job.h"
#include <algorithm>
#include <cinttypes>
#include <string>
#include <vector>
#include "db/version_edit.h"
#include "file/file_util.h"
#include "file/random_access_file_reader.h"
#include "logging/logging.h"
#include "table/merging_iterator.h"
#include "table/scoped_arena_iterator.h"
#include "table/sst_file_writer_collectors.h"
#include "table/table_builder.h"
#include "table/unique_id_impl.h"
#include "util/stop_watch.h"
namespace ROCKSDB_NAMESPACE {
Status ImportColumnFamilyJob::Prepare(uint64_t next_file_number,
SuperVersion* sv) {
Status status;
std::vector<ColumnFamilyIngestFileInfo> cf_ingest_infos;
for (const auto& metadata_per_cf : metadatas_) {
// Read the information of files we are importing
ColumnFamilyIngestFileInfo cf_file_info;
InternalKey smallest, largest;
int num_files = 0;
std::vector<IngestedFileInfo> files_to_import_per_cf;
for (size_t i = 0; i < metadata_per_cf.size(); i++) {
auto file_metadata = *metadata_per_cf[i];
const auto file_path = file_metadata.db_path + "/" + file_metadata.name;
IngestedFileInfo file_to_import;
status = GetIngestedFileInfo(file_path, next_file_number++, sv,
file_metadata, &file_to_import);
if (!status.ok()) {
return status;
}
if (file_to_import.num_entries == 0) {
status = Status::InvalidArgument("File contain no entries");
return status;
}
if (!file_to_import.smallest_internal_key.Valid() ||
!file_to_import.largest_internal_key.Valid()) {
status = Status::Corruption("File has corrupted keys");
return status;
}
files_to_import_per_cf.push_back(file_to_import);
num_files++;
// Calculate the smallest and largest keys of all files in this CF
if (i == 0) {
smallest = file_to_import.smallest_internal_key;
largest = file_to_import.largest_internal_key;
} else {
if (cfd_->internal_comparator().Compare(
smallest, file_to_import.smallest_internal_key) < 0) {
smallest = file_to_import.smallest_internal_key;
}
if (cfd_->internal_comparator().Compare(
largest, file_to_import.largest_internal_key) > 0) {
largest = file_to_import.largest_internal_key;
}
}
}
if (num_files == 0) {
status = Status::InvalidArgument("The list of files is empty");
return status;
}
files_to_import_.push_back(files_to_import_per_cf);
cf_file_info.smallest_internal_key = smallest;
cf_file_info.largest_internal_key = largest;
cf_ingest_infos.push_back(cf_file_info);
}
std::sort(cf_ingest_infos.begin(), cf_ingest_infos.end(),
[this](const ColumnFamilyIngestFileInfo& info1,
const ColumnFamilyIngestFileInfo& info2) {
return cfd_->user_comparator()->Compare(
info1.smallest_internal_key.user_key(),
info2.smallest_internal_key.user_key()) < 0;
});
for (size_t i = 0; i + 1 < cf_ingest_infos.size(); i++) {
if (cfd_->user_comparator()->Compare(
cf_ingest_infos[i].largest_internal_key.user_key(),
cf_ingest_infos[i + 1].smallest_internal_key.user_key()) >= 0) {
status = Status::InvalidArgument("CFs have overlapping ranges");
return status;
}
}
// Copy/Move external files into DB
auto hardlink_files = import_options_.move_files;
for (auto& files_to_import_per_cf : files_to_import_) {
for (auto& f : files_to_import_per_cf) {
const auto path_outside_db = f.external_file_path;
const auto path_inside_db = TableFileName(
cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId());
if (hardlink_files) {
status = fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(),
nullptr);
if (status.IsNotSupported()) {
// Original file is on a different FS, use copy instead of hard
// linking
hardlink_files = false;
ROCKS_LOG_INFO(db_options_.info_log,
"Try to link file %s but it's not supported : %s",
f.internal_file_path.c_str(),
status.ToString().c_str());
}
}
if (!hardlink_files) {
// FIXME: temperature handling (like ExternalSstFileIngestionJob)
status = CopyFile(fs_.get(), path_outside_db, Temperature::kUnknown,
path_inside_db, Temperature::kUnknown, 0,
db_options_.use_fsync, io_tracer_);
}
if (!status.ok()) {
break;
}
f.copy_file = !hardlink_files;
f.internal_file_path = path_inside_db;
}
if (!status.ok()) {
break;
}
}
if (!status.ok()) {
// We failed, remove all files that we copied into the db
for (auto& files_to_import_per_cf : files_to_import_) {
for (auto& f : files_to_import_per_cf) {
if (f.internal_file_path.empty()) {
break;
}
const auto s =
fs_->DeleteFile(f.internal_file_path, IOOptions(), nullptr);
if (!s.ok()) {
ROCKS_LOG_WARN(db_options_.info_log,
"AddFile() clean up for file %s failed : %s",
f.internal_file_path.c_str(), s.ToString().c_str());
}
}
}
}
return status;
}
// REQUIRES: we have become the only writer by entering both write_thread_ and
// nonmem_write_thread_
Status ImportColumnFamilyJob::Run() {
// We use the import time as the ancester time. This is the time the data
// is written to the database.
int64_t temp_current_time = 0;
uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
uint64_t current_time = kUnknownOldestAncesterTime;
if (clock_->GetCurrentTime(&temp_current_time).ok()) {
current_time = oldest_ancester_time =
static_cast<uint64_t>(temp_current_time);
}
// Recover files' epoch number using dummy VersionStorageInfo
VersionBuilder dummy_version_builder(
cfd_->current()->version_set()->file_options(), cfd_->ioptions(),
cfd_->table_cache(), cfd_->current()->storage_info(),
cfd_->current()->version_set(),
cfd_->GetFileMetadataCacheReservationManager());
VersionStorageInfo dummy_vstorage(
&cfd_->internal_comparator(), cfd_->user_comparator(),
cfd_->NumberLevels(), cfd_->ioptions()->compaction_style,
nullptr /* src_vstorage */, cfd_->ioptions()->force_consistency_checks,
EpochNumberRequirement::kMightMissing, cfd_->ioptions()->clock,
cfd_->GetLatestMutableCFOptions()->bottommost_file_compaction_delay,
cfd_->current()->version_set()->offpeak_time_option());
Status s;
for (size_t i = 0; s.ok() && i < files_to_import_.size(); ++i) {
for (size_t j = 0; s.ok() && j < files_to_import_[i].size(); ++j) {
const auto& f = files_to_import_[i][j];
const auto& file_metadata = *metadatas_[i][j];
uint64_t tail_size = 0;
bool contain_no_data_blocks = f.table_properties.num_entries > 0 &&
(f.table_properties.num_entries ==
f.table_properties.num_range_deletions);
if (f.table_properties.tail_start_offset > 0 || contain_no_data_blocks) {
uint64_t file_size = f.fd.GetFileSize();
assert(f.table_properties.tail_start_offset <= file_size);
tail_size = file_size - f.table_properties.tail_start_offset;
}
VersionEdit dummy_version_edit;
dummy_version_edit.AddFile(
file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(),
f.fd.GetFileSize(), f.smallest_internal_key, f.largest_internal_key,
file_metadata.smallest_seqno, file_metadata.largest_seqno, false,
file_metadata.temperature, kInvalidBlobFileNumber,
oldest_ancester_time, current_time, file_metadata.epoch_number,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, f.unique_id, 0,
tail_size,
static_cast<bool>(
f.table_properties.user_defined_timestamps_persisted));
s = dummy_version_builder.Apply(&dummy_version_edit);
}
}
if (s.ok()) {
s = dummy_version_builder.SaveTo(&dummy_vstorage);
}
if (s.ok()) {
dummy_vstorage.RecoverEpochNumbers(cfd_);
}
// Record changes from this CF import in VersionEdit, including files with
// recovered epoch numbers
if (s.ok()) {
edit_.SetColumnFamily(cfd_->GetID());
for (int level = 0; level < dummy_vstorage.num_levels(); level++) {
for (FileMetaData* file_meta : dummy_vstorage.LevelFiles(level)) {
edit_.AddFile(level, *file_meta);
// If incoming sequence number is higher, update local sequence number.
if (file_meta->fd.largest_seqno > versions_->LastSequence()) {
versions_->SetLastAllocatedSequence(file_meta->fd.largest_seqno);
versions_->SetLastPublishedSequence(file_meta->fd.largest_seqno);
versions_->SetLastSequence(file_meta->fd.largest_seqno);
}
}
}
}
// Release resources occupied by the dummy VersionStorageInfo
for (int level = 0; level < dummy_vstorage.num_levels(); level++) {
for (FileMetaData* file_meta : dummy_vstorage.LevelFiles(level)) {
file_meta->refs--;
if (file_meta->refs <= 0) {
delete file_meta;
}
}
}
return s;
}
void ImportColumnFamilyJob::Cleanup(const Status& status) {
if (!status.ok()) {
// We failed to add files to the database remove all the files we copied.
for (auto& files_to_import_per_cf : files_to_import_) {
for (auto& f : files_to_import_per_cf) {
const auto s =
fs_->DeleteFile(f.internal_file_path, IOOptions(), nullptr);
if (!s.ok()) {
ROCKS_LOG_WARN(db_options_.info_log,
"AddFile() clean up for file %s failed : %s",
f.internal_file_path.c_str(), s.ToString().c_str());
}
}
}
} else if (status.ok() && import_options_.move_files) {
// The files were moved and added successfully, remove original file links
for (auto& files_to_import_per_cf : files_to_import_) {
for (auto& f : files_to_import_per_cf) {
const auto s =
fs_->DeleteFile(f.external_file_path, IOOptions(), nullptr);
if (!s.ok()) {
ROCKS_LOG_WARN(
db_options_.info_log,
"%s was added to DB successfully but failed to remove original "
"file link : %s",
f.external_file_path.c_str(), s.ToString().c_str());
}
}
}
}
}
Status ImportColumnFamilyJob::GetIngestedFileInfo(
const std::string& external_file, uint64_t new_file_number,
SuperVersion* sv, const LiveFileMetaData& file_meta,
IngestedFileInfo* file_to_import) {
file_to_import->external_file_path = external_file;
Status status;
if (file_meta.size > 0) {
file_to_import->file_size = file_meta.size;
} else {
// Get external file size
status = fs_->GetFileSize(external_file, IOOptions(),
&file_to_import->file_size, nullptr);
if (!status.ok()) {
return status;
}
}
// Assign FD with number
file_to_import->fd =
FileDescriptor(new_file_number, 0, file_to_import->file_size);
// Create TableReader for external file
std::unique_ptr<TableReader> table_reader;
std::unique_ptr<FSRandomAccessFile> sst_file;
std::unique_ptr<RandomAccessFileReader> sst_file_reader;
status =
fs_->NewRandomAccessFile(external_file, env_options_, &sst_file, nullptr);
if (!status.ok()) {
return status;
}
sst_file_reader.reset(new RandomAccessFileReader(
std::move(sst_file), external_file, nullptr /*Env*/, io_tracer_));
// TODO(yuzhangyu): User-defined timestamps doesn't support importing column
// family. Pass in the correct `user_defined_timestamps_persisted` flag for
// creating `TableReaderOptions` when the support is there.
status = cfd_->ioptions()->table_factory->NewTableReader(
TableReaderOptions(
*cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor,
env_options_, cfd_->internal_comparator(),
sv->mutable_cf_options.block_protection_bytes_per_key,
/*skip_filters*/ false, /*immortal*/ false,
/*force_direct_prefetch*/ false, /*level*/ -1,
/*block_cache_tracer*/ nullptr,
/*max_file_size_for_l0_meta_pin*/ 0, versions_->DbSessionId(),
/*cur_file_num*/ new_file_number),
std::move(sst_file_reader), file_to_import->file_size, &table_reader);
if (!status.ok()) {
return status;
}
// Get the external file properties
auto props = table_reader->GetTableProperties();
// Set original_seqno to 0.
file_to_import->original_seqno = 0;
// Get number of entries in table
file_to_import->num_entries = props->num_entries;
// If the importing files were exported with Checkpoint::ExportColumnFamily(),
// we cannot simply recompute smallest and largest used to truncate range
// tombstones from file content, and we expect smallest and largest populated
// in file_meta.
if (file_meta.smallest.empty()) {
assert(file_meta.largest.empty());
// TODO: plumb Env::IOActivity, Env::IOPriority
ReadOptions ro;
std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
/*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
// Get first (smallest) key from file
iter->SeekToFirst();
bool bound_set = false;
if (iter->Valid()) {
file_to_import->smallest_internal_key.DecodeFrom(iter->key());
Slice largest;
if (strcmp(cfd_->ioptions()->table_factory->Name(), "PlainTable") == 0) {
// PlainTable iterator does not support SeekToLast().
largest = iter->key();
for (; iter->Valid(); iter->Next()) {
if (cfd_->internal_comparator().Compare(iter->key(), largest) > 0) {
largest = iter->key();
}
}
if (!iter->status().ok()) {
return iter->status();
}
} else {
iter->SeekToLast();
if (!iter->Valid()) {
if (iter->status().ok()) {
// The file contains at least 1 key since iter is valid after
// SeekToFirst().
return Status::Corruption("Can not find largest key in sst file");
} else {
return iter->status();
}
}
largest = iter->key();
}
file_to_import->largest_internal_key.DecodeFrom(largest);
bound_set = true;
} else if (!iter->status().ok()) {
return iter->status();
}
std::unique_ptr<InternalIterator> range_del_iter{
table_reader->NewRangeTombstoneIterator(ro)};
if (range_del_iter != nullptr) {
range_del_iter->SeekToFirst();
if (range_del_iter->Valid()) {
ParsedInternalKey key;
Status pik_status = ParseInternalKey(range_del_iter->key(), &key,
db_options_.allow_data_in_errors);
if (!pik_status.ok()) {
return Status::Corruption("Corrupted key in external file. ",
pik_status.getState());
}
RangeTombstone first_tombstone(key, range_del_iter->value());
InternalKey start_key = first_tombstone.SerializeKey();
const InternalKeyComparator* icmp = &cfd_->internal_comparator();
if (!bound_set ||
icmp->Compare(start_key, file_to_import->smallest_internal_key) <
0) {
file_to_import->smallest_internal_key = start_key;
}
range_del_iter->SeekToLast();
pik_status = ParseInternalKey(range_del_iter->key(), &key,
db_options_.allow_data_in_errors);
if (!pik_status.ok()) {
return Status::Corruption("Corrupted key in external file. ",
pik_status.getState());
}
RangeTombstone last_tombstone(key, range_del_iter->value());
InternalKey end_key = last_tombstone.SerializeEndKey();
if (!bound_set ||
icmp->Compare(end_key, file_to_import->largest_internal_key) > 0) {
file_to_import->largest_internal_key = end_key;
}
bound_set = true;
}
}
assert(bound_set);
} else {
assert(!file_meta.largest.empty());
file_to_import->smallest_internal_key.DecodeFrom(file_meta.smallest);
file_to_import->largest_internal_key.DecodeFrom(file_meta.largest);
}
file_to_import->cf_id = static_cast<uint32_t>(props->column_family_id);
file_to_import->table_properties = *props;
auto s = GetSstInternalUniqueId(props->db_id, props->db_session_id,
props->orig_file_number,
&(file_to_import->unique_id));
if (!s.ok()) {
ROCKS_LOG_WARN(db_options_.info_log,
"Failed to get SST unique id for file %s",
file_to_import->internal_file_path.c_str());
}
return status;
}
} // namespace ROCKSDB_NAMESPACE