rocksdb/db/experimental.cc
Changyu Bi cc6f323705 Include estimated bytes deleted by range tombstones in compensated file size (#10734)
Summary:
Compensate file sizes in compaction picking so that files with range tombstones are preferred and get compacted down earlier, since they tend to delete a lot of data. This PR adds a `compensated_range_deletion_size` field in FileMeta that is computed during Flush/Compaction and persisted in MANIFEST. This value is added to `compensated_file_size`, which is used for compaction picking. Currently, for a file in level L, `compensated_range_deletion_size` is set to the estimated number of bytes deleted by this file's range tombstones in all levels > L. This helps to reduce space amp when data in older levels is covered by range tombstones in level L.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10734

Test Plan:
- Added unit tests.
- Ran a benchmark to check whether the above definition of `compensated_range_deletion_size` reduces space amp as intended without affecting write amp too much. The experiment is set up to be favorable to this optimization: large range tombstones issued infrequently. Command used:
```
./db_bench -benchmarks=fillrandom,waitforcompaction,stats,levelstats -use_existing_db=false -avoid_flush_during_recovery=true -write_buffer_size=33554432 -level_compaction_dynamic_level_bytes=true -max_background_jobs=8 -max_bytes_for_level_base=134217728 -target_file_size_base=33554432 -writes_per_range_tombstone=500000 -range_tombstone_width=5000000 -num=50000000 -benchmark_write_rate_limit=8388608 -threads=16 -duration=1800 --max_num_range_tombstones=1000000000
```

In this experiment, each thread wrote 16 range tombstones over the duration of 30 minutes; each range tombstone has a width of 5M, which is 10% of the key space width. Results show this PR generates a smaller DB size.

Compaction stats from this PR:
```
Level    Files   Size     Score Read(GB)  Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) CompMergeCPU(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Rblob(GB) Wblob(GB)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
  L0      2/0   31.54 MB   0.5      0.0     0.0      0.0       8.4      8.4       0.0   1.0      0.0     63.4    135.56            110.94       544    0.249       0      0       0.0       0.0
  L4      3/0   96.55 MB   0.8     18.5     6.7     11.8      18.4      6.6       0.0   2.7     65.3     64.9    290.08            284.03       108    2.686    284M  1957K       0.0       0.0
  L5     15/0   404.41 MB   1.0     19.1     7.7     11.4      18.8      7.4       0.3   2.5     66.6     65.7    292.93            285.34       220    1.332    293M  3808K       0.0       0.0
  L6    143/0    4.12 GB   0.0     45.0     7.5     37.5      41.6      4.1       0.0   5.5     71.2     65.9    647.00            632.66       251    2.578    739M    47M       0.0       0.0
 Sum    163/0    4.64 GB   0.0     82.6    21.9     60.7      87.2     26.5       0.3  10.4     61.9     65.4   1365.58           1312.97      1123    1.216   1318M    52M       0.0       0.0
```

Compaction stats from main:
```
Level    Files   Size     Score Read(GB)  Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) CompMergeCPU(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Rblob(GB) Wblob(GB)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
  L0      0/0    0.00 KB   0.0      0.0     0.0      0.0       8.4      8.4       0.0   1.0      0.0     60.5    142.12            115.89       569    0.250       0      0       0.0       0.0
  L4      3/0   85.68 MB   1.0     17.7     6.8     10.9      17.6      6.7       0.0   2.6     62.7     62.3    289.05            281.79       112    2.581    272M  2309K       0.0       0.0
  L5     11/0   293.73 MB   1.0     18.8     7.5     11.2      18.5      7.2       0.5   2.5     64.9     63.9    296.07            288.50       220    1.346    288M  4365K       0.0       0.0
  L6    130/0    3.94 GB   0.0     51.5     7.6     43.9      47.9      3.9       0.0   6.3     67.2     62.4    784.95            765.92       258    3.042    848M    51M       0.0       0.0
 Sum    144/0    4.31 GB   0.0     88.0    21.9     66.0      92.3     26.3       0.5  11.0     59.6     62.5   1512.19           1452.09      1159    1.305   1409M    58M       0.0       0.0
```

Reviewed By: ajkr

Differential Revision: D39834713

Pulled By: cbi42

fbshipit-source-id: fe9341040b8704a8fbb10cad5cf5c43e962c7e6b
2022-12-29 13:28:24 -08:00

157 lines
4.8 KiB
C++

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "rocksdb/experimental.h"
#include "db/db_impl/db_impl.h"
#include "db/version_util.h"
#include "logging/logging.h"
namespace ROCKSDB_NAMESPACE {
namespace experimental {
#ifndef ROCKSDB_LITE
// Forwards a compaction suggestion for the key range [begin, end] of
// `column_family` to `db`.
//
// Returns Status::InvalidArgument when `db` is null; otherwise returns
// whatever DB::SuggestCompactRange() reports.
Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family,
                           const Slice* begin, const Slice* end) {
  if (db != nullptr) {
    return db->SuggestCompactRange(column_family, begin, end);
  }
  // No DB instance to forward the request to.
  return Status::InvalidArgument("DB is empty");
}
// Forwards an L0 promotion request for `column_family` to `db`, targeting
// `target_level`.
//
// Returns Status::InvalidArgument when `db` is null; otherwise returns
// whatever DB::PromoteL0() reports.
Status PromoteL0(DB* db, ColumnFamilyHandle* column_family, int target_level) {
  return db != nullptr
             ? db->PromoteL0(column_family, target_level)
             : Status::InvalidArgument("Didn't recognize DB object");
}
#else // ROCKSDB_LITE
// ROCKSDB_LITE stub: every argument is ignored and the call always reports
// Status::NotSupported.
Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family,
                           const Slice* begin, const Slice* end) {
  // Explicitly discard the arguments; none of them is consulted in LITE.
  (void)db;
  (void)column_family;
  (void)begin;
  (void)end;
  return Status::NotSupported("Not supported in RocksDB LITE");
}
// ROCKSDB_LITE stub: every argument is ignored and the call always reports
// Status::NotSupported.
Status PromoteL0(DB* db, ColumnFamilyHandle* column_family, int target_level) {
  // Explicitly discard the arguments; none of them is consulted in LITE.
  (void)db;
  (void)column_family;
  (void)target_level;
  return Status::NotSupported("Not supported in RocksDB LITE");
}
#endif // ROCKSDB_LITE
// Convenience overload that suggests compaction of [begin, end] in the
// default column family of `db`.
//
// Returns Status::InvalidArgument when `db` is null; otherwise returns the
// result of the column-family overload of SuggestCompactRange().
Status SuggestCompactRange(DB* db, const Slice* begin, const Slice* end) {
  // `db` must be checked here: fetching the default column family below
  // dereferences it before the column-family overload can run its own
  // null check.
  if (db == nullptr) {
    return Status::InvalidArgument("DB is empty");
  }
  return SuggestCompactRange(db, db->DefaultColumnFamily(), begin, end);
}
// Compares the per-file state recorded in the MANIFEST of the DB at `db_name`
// with what the FileSystem currently reports, and applies a VersionEdit per
// column family for any file whose state differs.
//
// The manifest is recovered through an OfflineManifestWriter using
// `column_families`. For every non-dropped, initialized column family, each
// SST file in the current version is opened through the FileSystem. When
// `opts.update_temperatures` is set and the opened file reports a known
// temperature different from the recorded one, the file entry is deleted and
// re-added with the reported temperature; all other metadata is carried over.
//
// Returns the first non-OK status encountered (recovery, opening a file,
// opening the DB directory, or applying the edit); OK otherwise. The outcome
// is also logged to `db_opts.info_log`.
Status UpdateManifestForFilesState(
    const DBOptions& db_opts, const std::string& db_name,
    const std::vector<ColumnFamilyDescriptor>& column_families,
    const UpdateManifestForFilesStateOptions& opts) {
  OfflineManifestWriter writer(db_opts, db_name);
  Status status = writer.Recover(column_families);

  size_t num_files_updated = 0;
  size_t num_cfs_updated = 0;
  auto file_system = db_opts.env->GetFileSystem();

  for (auto cfd : *writer.Versions().GetColumnFamilySet()) {
    // Stop at the first failure, including a failed Recover() above.
    if (!status.ok()) {
      break;
    }
    assert(cfd);
    const bool is_live = !cfd->IsDropped() && cfd->initialized();
    if (!is_live) {
      continue;
    }

    const auto* version = cfd->current();
    assert(version);
    const auto* storage = version->storage_info();
    assert(storage);

    // Collects all changes for this column family.
    VersionEdit cf_edit;
    cf_edit.SetColumnFamily(cfd->GetID());

    /* SST files */
    for (int level = 0; status.ok() && level < cfd->NumberLevels(); ++level) {
      for (const auto& meta : storage->LevelFiles(level)) {
        assert(meta);
        uint64_t file_number = meta->fd.GetNumber();
        std::string file_path = TableFileName(writer.IOptions().db_paths,
                                              file_number,
                                              meta->fd.GetPathId());

        std::unique_ptr<FSSequentialFile> reader;
        FileOptions file_opts;
        // Use kUnknown to signal the FileSystem to search all tiers for the
        // file.
        file_opts.temperature = Temperature::kUnknown;

        IOStatus open_status = file_system->NewSequentialFile(
            file_path, file_opts, &reader, /*dbg*/ nullptr);
        if (!open_status.ok()) {
          // Record the failure; the loop conditions end the scan.
          status = open_status;
          break;
        }
        if (!opts.update_temperatures) {
          continue;
        }

        Temperature reported = reader->GetTemperature();
        if (reported == Temperature::kUnknown ||
            reported == meta->temperature) {
          // Nothing known to reconcile, or already consistent.
          continue;
        }

        // Current state inconsistent with manifest: replace the entry with
        // one that carries the reported temperature.
        ++num_files_updated;
        cf_edit.DeleteFile(level, file_number);
        cf_edit.AddFile(level, file_number, meta->fd.GetPathId(),
                        meta->fd.GetFileSize(), meta->smallest, meta->largest,
                        meta->fd.smallest_seqno, meta->fd.largest_seqno,
                        meta->marked_for_compaction, reported,
                        meta->oldest_blob_file_number,
                        meta->oldest_ancester_time, meta->file_creation_time,
                        meta->epoch_number, meta->file_checksum,
                        meta->file_checksum_func_name, meta->unique_id,
                        meta->compensated_range_deletion_size);
      }
    }

    if (status.ok() && cf_edit.NumEntries() > 0) {
      std::unique_ptr<FSDirectory> db_dir;
      status =
          file_system->NewDirectory(db_name, IOOptions(), &db_dir, nullptr);
      if (status.ok()) {
        status = writer.LogAndApply(cfd, &cf_edit, db_dir.get());
        if (status.ok()) {
          ++num_cfs_updated;
        }
      }
    }
  }

  if (num_cfs_updated > 0) {
    ROCKS_LOG_INFO(db_opts.info_log,
                   "UpdateManifestForFilesState: updated %zu files in %zu CFs",
                   num_files_updated, num_cfs_updated);
  } else if (status.ok()) {
    ROCKS_LOG_INFO(db_opts.info_log,
                   "UpdateManifestForFilesState: no updates needed");
  }
  if (!status.ok()) {
    ROCKS_LOG_ERROR(db_opts.info_log, "UpdateManifestForFilesState failed: %s",
                    status.ToString().c_str());
  }
  return status;
}
} // namespace experimental
} // namespace ROCKSDB_NAMESPACE