mirror of https://github.com/facebook/rocksdb.git
In delete scheduler, before ftruncate file for slow delete, check whether there is other hard links (#4093)
Summary: Right now slow deletion with ftruncate doesn't work well with checkpoints because it ruin hard linked files in checkpoints. To fix it, check the file has no other hard link before ftruncate it. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4093 Differential Revision: D8730360 Pulled By: siying fbshipit-source-id: 756eea5bce8a87b9a2ea3a5bfa190b2cab6f75df
This commit is contained in:
parent
b9846370e9
commit
926f3a78a6
|
@ -613,6 +613,15 @@ class PosixEnv : public Env {
|
|||
return result;
|
||||
}
|
||||
|
||||
Status NumFileLinks(const std::string& fname, uint64_t* count) override {
|
||||
struct stat s;
|
||||
if (stat(fname.c_str(), &s) != 0) {
|
||||
return IOError("while stat a file for num file links", fname, errno);
|
||||
}
|
||||
*count = static_cast<uint64_t>(s.st_nlink);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual Status AreFilesSame(const std::string& first,
|
||||
const std::string& second, bool* res) override {
|
||||
struct stat statbuf[2];
|
||||
|
|
|
@ -289,6 +289,12 @@ class Env {
|
|||
return Status::NotSupported("LinkFile is not supported for this Env");
|
||||
}
|
||||
|
||||
virtual Status NumFileLinks(const std::string& /*fname*/,
|
||||
uint64_t* /*count*/) {
|
||||
return Status::NotSupported(
|
||||
"Getting number of file links is not supported for this Env");
|
||||
}
|
||||
|
||||
virtual Status AreFilesSame(const std::string& /*first*/,
|
||||
const std::string& /*second*/, bool* /*res*/) {
|
||||
return Status::NotSupported("AreFilesSame is not supported for this Env");
|
||||
|
@ -1064,6 +1070,10 @@ class EnvWrapper : public Env {
|
|||
return target_->LinkFile(s, t);
|
||||
}
|
||||
|
||||
Status NumFileLinks(const std::string& fname, uint64_t* count) override {
|
||||
return target_->NumFileLinks(fname, count);
|
||||
}
|
||||
|
||||
Status AreFilesSame(const std::string& first, const std::string& second,
|
||||
bool* res) override {
|
||||
return target_->AreFilesSame(first, second, res);
|
||||
|
|
|
@ -96,14 +96,18 @@ class SstFileManager {
|
|||
// @param max_trash_db_ratio: If the trash size constitutes for more than this
|
||||
// fraction of the total DB size we will start deleting new files passed to
|
||||
// DeleteScheduler immediately
|
||||
// @param bytes_max_delete_chunk: if a single file is larger than delete chunk,
|
||||
// ftruncate the file by this size each time, rather than dropping the whole
|
||||
// file. 0 means to always delete the whole file. NOTE this options may not
|
||||
// work well with checkpoints, which relies on file system hard links.
|
||||
// @param bytes_max_delete_chunk: if a file to delete is larger than delete
|
||||
// chunk, ftruncate the file by this size each time, rather than dropping the
|
||||
// whole file. 0 means to always delete the whole file. If the file has more
|
||||
// than one linked names, the file will be deleted as a whole. Either way,
|
||||
// `rate_bytes_per_sec` will be appreciated. NOTE that with this option,
|
||||
// files already renamed as a trash may be partial, so users should not
|
||||
// directly recover them without checking.
|
||||
extern SstFileManager* NewSstFileManager(
|
||||
Env* env, std::shared_ptr<Logger> info_log = nullptr,
|
||||
std::string trash_dir = "", int64_t rate_bytes_per_sec = 0,
|
||||
bool delete_existing_trash = true, Status* status = nullptr,
|
||||
double max_trash_db_ratio = 0.25, uint64_t bytes_max_delete_chunk = 0);
|
||||
double max_trash_db_ratio = 0.25,
|
||||
uint64_t bytes_max_delete_chunk = 64 * 1024 * 1024);
|
||||
|
||||
} // namespace rocksdb
|
||||
|
|
|
@ -138,8 +138,7 @@ jobject Java_org_rocksdb_SstFileManager_getTrackedFiles(JNIEnv* env,
|
|||
|
||||
const rocksdb::HashMapJni::FnMapKV<const std::string, const uint64_t>
|
||||
fn_map_kv =
|
||||
[env](
|
||||
const std::pair<const std::string, const uint64_t>& pair) {
|
||||
[env](const std::pair<const std::string, const uint64_t>& pair) {
|
||||
const jstring jtracked_file_path =
|
||||
env->NewStringUTF(pair.first.c_str());
|
||||
if (jtracked_file_path == nullptr) {
|
||||
|
|
|
@ -267,24 +267,46 @@ Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
|
|||
if (s.ok()) {
|
||||
bool need_full_delete = true;
|
||||
if (bytes_max_delete_chunk_ != 0 && file_size > bytes_max_delete_chunk_) {
|
||||
unique_ptr<WritableFile> wf;
|
||||
Status my_status =
|
||||
env_->ReopenWritableFile(path_in_trash, &wf, EnvOptions());
|
||||
uint64_t num_hard_links = 2;
|
||||
// We don't have to worry aobut data race between linking a new
|
||||
// file after the number of file link check and ftruncte because
|
||||
// the file is now in trash and no hardlink is supposed to create
|
||||
// to trash files by RocksDB.
|
||||
Status my_status = env_->NumFileLinks(path_in_trash, &num_hard_links);
|
||||
if (my_status.ok()) {
|
||||
my_status = wf->Truncate(file_size - bytes_max_delete_chunk_);
|
||||
if (my_status.ok()) {
|
||||
TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:Fsync");
|
||||
my_status = wf->Fsync();
|
||||
if (num_hard_links == 1) {
|
||||
unique_ptr<WritableFile> wf;
|
||||
my_status =
|
||||
env_->ReopenWritableFile(path_in_trash, &wf, EnvOptions());
|
||||
if (my_status.ok()) {
|
||||
my_status = wf->Truncate(file_size - bytes_max_delete_chunk_);
|
||||
if (my_status.ok()) {
|
||||
TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:Fsync");
|
||||
my_status = wf->Fsync();
|
||||
}
|
||||
}
|
||||
if (my_status.ok()) {
|
||||
*deleted_bytes = bytes_max_delete_chunk_;
|
||||
need_full_delete = false;
|
||||
*is_complete = false;
|
||||
} else {
|
||||
ROCKS_LOG_WARN(info_log_,
|
||||
"Failed to partially delete %s from trash -- %s",
|
||||
path_in_trash.c_str(), my_status.ToString().c_str());
|
||||
}
|
||||
} else {
|
||||
ROCKS_LOG_INFO(info_log_,
|
||||
"Cannot delete %s slowly through ftruncate from trash "
|
||||
"as it has other links",
|
||||
path_in_trash.c_str());
|
||||
}
|
||||
}
|
||||
if (my_status.ok()) {
|
||||
*deleted_bytes = bytes_max_delete_chunk_;
|
||||
need_full_delete = false;
|
||||
*is_complete = false;
|
||||
} else {
|
||||
ROCKS_LOG_WARN(info_log_,
|
||||
"Failed to partially delete %s from trash -- %s",
|
||||
path_in_trash.c_str(), my_status.ToString().c_str());
|
||||
} else if (!num_link_error_printed_) {
|
||||
ROCKS_LOG_INFO(
|
||||
info_log_,
|
||||
"Cannot delete files slowly through ftruncate from trash "
|
||||
"as Env::NumFileLinks() returns error: %s",
|
||||
my_status.ToString().c_str());
|
||||
num_link_error_printed_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -108,6 +108,8 @@ class DeleteScheduler {
|
|||
uint64_t bytes_max_delete_chunk_;
|
||||
// Errors that happened in BackgroundEmptyTrash (file_path => error)
|
||||
std::map<std::string, Status> bg_errors_;
|
||||
|
||||
bool num_link_error_printed_ = false;
|
||||
// Set to true in ~DeleteScheduler() to force BackgroundEmptyTrash to stop
|
||||
bool closing_;
|
||||
// Condition variable signaled in these conditions
|
||||
|
|
|
@ -478,6 +478,40 @@ TEST_F(DeleteSchedulerTest, DeletePartialFile) {
|
|||
rocksdb::SyncPoint::GetInstance()->EnableProcessing();
|
||||
}
|
||||
|
||||
#ifdef OS_LINUX
|
||||
TEST_F(DeleteSchedulerTest, NoPartialDeleteWithLink) {
|
||||
int bg_delete_file = 0;
|
||||
int bg_fsync = 0;
|
||||
rocksdb::SyncPoint::GetInstance()->SetCallBack(
|
||||
"DeleteScheduler::DeleteTrashFile:DeleteFile",
|
||||
[&](void*) { bg_delete_file++; });
|
||||
rocksdb::SyncPoint::GetInstance()->SetCallBack(
|
||||
"DeleteScheduler::DeleteTrashFile:Fsync", [&](void*) { bg_fsync++; });
|
||||
rocksdb::SyncPoint::GetInstance()->EnableProcessing();
|
||||
|
||||
rate_bytes_per_sec_ = 1024 * 1024; // 1 MB / sec
|
||||
NewDeleteScheduler();
|
||||
|
||||
std::string file1 = NewDummyFile("data_1", 500 * 1024);
|
||||
std::string file2 = NewDummyFile("data_2", 100 * 1024);
|
||||
|
||||
ASSERT_OK(env_->LinkFile(file1, dummy_files_dirs_[0] + "/data_1b"));
|
||||
ASSERT_OK(env_->LinkFile(file2, dummy_files_dirs_[0] + "/data_2b"));
|
||||
|
||||
// Should delete in 4 batch if there is no hardlink
|
||||
ASSERT_OK(delete_scheduler_->DeleteFile(file1, ""));
|
||||
ASSERT_OK(delete_scheduler_->DeleteFile(file2, ""));
|
||||
|
||||
delete_scheduler_->WaitForEmptyTrash();
|
||||
|
||||
auto bg_errors = delete_scheduler_->GetBackgroundErrors();
|
||||
ASSERT_EQ(bg_errors.size(), 0);
|
||||
ASSERT_EQ(2, bg_delete_file);
|
||||
ASSERT_EQ(0, bg_fsync);
|
||||
rocksdb::SyncPoint::GetInstance()->EnableProcessing();
|
||||
}
|
||||
#endif
|
||||
|
||||
// 1- Create a DeleteScheduler with very slow rate limit (1 Byte / sec)
|
||||
// 2- Delete 100 files using DeleteScheduler
|
||||
// 3- Delete the DeleteScheduler (call the destructor while queue is not empty)
|
||||
|
|
Loading…
Reference in New Issue