mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-27 02:44:18 +00:00
cc6f323705
Summary: Compensate file sizes in compaction picking so files with range tombstones are preferred, such that they get compacted down earlier as they tend to delete a lot of data. This PR adds a `compensated_range_deletion_size` field in FileMeta that is computed during Flush/Compaction and persisted in MANIFEST. This value is added to `compensated_file_size`, which will be used for compaction picking. Currently, for a file in level L, `compensated_range_deletion_size` is set to the estimated bytes deleted by the range tombstones of this file in all levels > L. This helps to reduce space amp when data in older levels is covered by range tombstones in level L. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10734 Test Plan: - Added unit tests. - Ran a benchmark to check whether the above definition of `compensated_range_deletion_size` reduces space amp as intended, without affecting write amp too much. The experiment setup is favorable for this optimization: large range tombstones issued infrequently. Command used: ``` ./db_bench -benchmarks=fillrandom,waitforcompaction,stats,levelstats -use_existing_db=false -avoid_flush_during_recovery=true -write_buffer_size=33554432 -level_compaction_dynamic_level_bytes=true -max_background_jobs=8 -max_bytes_for_level_base=134217728 -target_file_size_base=33554432 -writes_per_range_tombstone=500000 -range_tombstone_width=5000000 -num=50000000 -benchmark_write_rate_limit=8388608 -threads=16 -duration=1800 --max_num_range_tombstones=1000000000 ``` In this experiment, each thread wrote 16 range tombstones over the duration of 30 minutes; each range tombstone has a width of 5M, which is 10% of the key space width. Results show that this PR generates a smaller DB size.
Compaction stats from this PR: ``` Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) CompMergeCPU(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Rblob(GB) Wblob(GB) ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ L0 2/0 31.54 MB 0.5 0.0 0.0 0.0 8.4 8.4 0.0 1.0 0.0 63.4 135.56 110.94 544 0.249 0 0 0.0 0.0 L4 3/0 96.55 MB 0.8 18.5 6.7 11.8 18.4 6.6 0.0 2.7 65.3 64.9 290.08 284.03 108 2.686 284M 1957K 0.0 0.0 L5 15/0 404.41 MB 1.0 19.1 7.7 11.4 18.8 7.4 0.3 2.5 66.6 65.7 292.93 285.34 220 1.332 293M 3808K 0.0 0.0 L6 143/0 4.12 GB 0.0 45.0 7.5 37.5 41.6 4.1 0.0 5.5 71.2 65.9 647.00 632.66 251 2.578 739M 47M 0.0 0.0 Sum 163/0 4.64 GB 0.0 82.6 21.9 60.7 87.2 26.5 0.3 10.4 61.9 65.4 1365.58 1312.97 1123 1.216 1318M 52M 0.0 0.0 ``` Compaction stats from main: ``` Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) CompMergeCPU(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Rblob(GB) Wblob(GB) ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ L0 0/0 0.00 KB 0.0 0.0 0.0 0.0 8.4 8.4 0.0 1.0 0.0 60.5 142.12 115.89 569 0.250 0 0 0.0 0.0 L4 3/0 85.68 MB 1.0 17.7 6.8 10.9 17.6 6.7 0.0 2.6 62.7 62.3 289.05 281.79 112 2.581 272M 2309K 0.0 0.0 L5 11/0 293.73 MB 1.0 18.8 7.5 11.2 18.5 7.2 0.5 2.5 64.9 63.9 296.07 288.50 220 1.346 288M 4365K 0.0 0.0 L6 130/0 3.94 GB 0.0 51.5 7.6 43.9 47.9 3.9 0.0 6.3 67.2 62.4 784.95 765.92 258 3.042 848M 51M 0.0 0.0 Sum 144/0 4.31 GB 0.0 88.0 21.9 66.0 92.3 26.3 0.5 11.0 59.6 62.5 1512.19 1452.09 1159 1.305 1409M 58M 0.0 0.0``` Reviewed By: ajkr Differential Revision: D39834713 Pulled By: cbi42 fbshipit-source-id: fe9341040b8704a8fbb10cad5cf5c43e962c7e6b
157 lines
4.8 KiB
C++
157 lines
4.8 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#include "rocksdb/experimental.h"
|
|
|
|
#include "db/db_impl/db_impl.h"
|
|
#include "db/version_util.h"
|
|
#include "logging/logging.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
namespace experimental {
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
// Forwards to `db->SuggestCompactRange(column_family, begin, end)` and returns
// its Status. Returns Status::InvalidArgument if `db` is null.
Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family,
                           const Slice* begin, const Slice* end) {
  if (db != nullptr) {
    return db->SuggestCompactRange(column_family, begin, end);
  }
  // No DB object to forward the request to.
  return Status::InvalidArgument("DB is empty");
}
|
|
|
|
// Forwards to `db->PromoteL0(column_family, target_level)` and returns its
// Status. Returns Status::InvalidArgument if `db` is null.
Status PromoteL0(DB* db, ColumnFamilyHandle* column_family, int target_level) {
  if (db != nullptr) {
    return db->PromoteL0(column_family, target_level);
  }
  // No DB object to forward the request to.
  return Status::InvalidArgument("Didn't recognize DB object");
}
|
|
|
|
#else // ROCKSDB_LITE
|
|
|
|
// ROCKSDB_LITE stub: ignores all arguments and always returns
// Status::NotSupported.
Status SuggestCompactRange(DB* /*db*/, ColumnFamilyHandle* /*column_family*/,
                           const Slice* /*begin*/, const Slice* /*end*/) {
  const char* const kReason = "Not supported in RocksDB LITE";
  return Status::NotSupported(kReason);
}
|
|
|
|
// ROCKSDB_LITE stub: ignores all arguments and always returns
// Status::NotSupported.
Status PromoteL0(DB* /*db*/, ColumnFamilyHandle* /*column_family*/,
                 int /*target_level*/) {
  const char* const kReason = "Not supported in RocksDB LITE";
  return Status::NotSupported(kReason);
}
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
// Convenience overload that targets the DB's default column family: forwards
// to the four-argument SuggestCompactRange with `db->DefaultColumnFamily()`.
//
// Returns Status::InvalidArgument if `db` is null; otherwise returns the
// Status of the four-argument overload.
Status SuggestCompactRange(DB* db, const Slice* begin, const Slice* end) {
  // `db` is dereferenced below to look up the default column family, so it
  // must be validated here; the callee's own null check would run too late.
  if (db == nullptr) {
    return Status::InvalidArgument("DB is empty");
  }
  return SuggestCompactRange(db, db->DefaultColumnFamily(), begin, end);
}
|
|
|
|
// Rewrites manifest entries whose recorded file state disagrees with what the
// FileSystem reports for the corresponding SST file.
//
// After recovering the manifest through an OfflineManifestWriter, every SST
// file of every column family that is initialized and not dropped is opened
// via the FileSystem obtained from `db_opts.env`. When
// `opts.update_temperatures` is set and the opened file reports a temperature
// that is neither kUnknown nor equal to the one recorded in the file's
// metadata, the file entry is deleted and re-added with the reported
// temperature (all other metadata is carried over). Column families with at
// least one such change get their edit written with LogAndApply.
//
// Processing stops at the first failure (recovery, opening a file, opening
// the DB directory, or applying an edit), and that Status is returned. The
// outcome is also logged to `db_opts.info_log`.
Status UpdateManifestForFilesState(
    const DBOptions& db_opts, const std::string& db_name,
    const std::vector<ColumnFamilyDescriptor>& column_families,
    const UpdateManifestForFilesStateOptions& opts) {
  OfflineManifestWriter writer(db_opts, db_name);
  Status status = writer.Recover(column_families);

  size_t num_files_updated = 0;
  size_t num_cfs_updated = 0;
  auto file_system = db_opts.env->GetFileSystem();

  for (auto cfd : *writer.Versions().GetColumnFamilySet()) {
    // Also covers a failed Recover(): nothing is processed in that case.
    if (!status.ok()) {
      break;
    }
    assert(cfd);

    const bool usable = !cfd->IsDropped() && cfd->initialized();
    if (!usable) {
      continue;
    }

    const auto* version = cfd->current();
    assert(version);

    const auto* storage = version->storage_info();
    assert(storage);

    // Accumulates all changes for this column family.
    VersionEdit cf_edit;
    cf_edit.SetColumnFamily(cfd->GetID());

    /* SST files */
    for (int level = 0; level < cfd->NumberLevels() && status.ok(); level++) {
      for (const auto& meta : storage->LevelFiles(level)) {
        assert(meta);

        const uint64_t file_number = meta->fd.GetNumber();
        const std::string file_path = TableFileName(
            writer.IOptions().db_paths, file_number, meta->fd.GetPathId());

        std::unique_ptr<FSSequentialFile> file;
        FileOptions file_opts;
        // Use kUnknown to signal the FileSystem to search all tiers for the
        // file.
        file_opts.temperature = Temperature::kUnknown;

        IOStatus open_status = file_system->NewSequentialFile(
            file_path, file_opts, &file, /*dbg*/ nullptr);
        if (!open_status.ok()) {
          // Abort the scan; the outer loops observe the failed status.
          status = open_status;
          break;
        }

        if (!opts.update_temperatures) {
          continue;
        }

        const Temperature reported = file->GetTemperature();
        const bool mismatch = reported != Temperature::kUnknown &&
                              reported != meta->temperature;
        if (!mismatch) {
          continue;
        }

        // Current state inconsistent with manifest
        ++num_files_updated;
        cf_edit.DeleteFile(level, file_number);
        cf_edit.AddFile(level, file_number, meta->fd.GetPathId(),
                        meta->fd.GetFileSize(), meta->smallest, meta->largest,
                        meta->fd.smallest_seqno, meta->fd.largest_seqno,
                        meta->marked_for_compaction, reported,
                        meta->oldest_blob_file_number,
                        meta->oldest_ancester_time, meta->file_creation_time,
                        meta->epoch_number, meta->file_checksum,
                        meta->file_checksum_func_name, meta->unique_id,
                        meta->compensated_range_deletion_size);
      }
    }

    const bool has_changes = status.ok() && cf_edit.NumEntries() > 0;
    if (!has_changes) {
      continue;
    }

    std::unique_ptr<FSDirectory> db_dir;
    status = file_system->NewDirectory(db_name, IOOptions(), &db_dir, nullptr);
    if (status.ok()) {
      status = writer.LogAndApply(cfd, &cf_edit, db_dir.get());
      if (status.ok()) {
        ++num_cfs_updated;
      }
    }
  }

  if (num_cfs_updated == 0) {
    if (status.ok()) {
      ROCKS_LOG_INFO(db_opts.info_log,
                     "UpdateManifestForFilesState: no updates needed");
    }
  } else {
    ROCKS_LOG_INFO(db_opts.info_log,
                   "UpdateManifestForFilesState: updated %zu files in %zu CFs",
                   num_files_updated, num_cfs_updated);
  }
  if (!status.ok()) {
    ROCKS_LOG_ERROR(db_opts.info_log, "UpdateManifestForFilesState failed: %s",
                    status.ToString().c_str());
  }

  return status;
}
|
|
|
|
} // namespace experimental
|
|
} // namespace ROCKSDB_NAMESPACE
|